Re-generate QC8 IGEMM microkernels

PiperOrigin-RevId: 382424793
diff --git a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S
index b38a799..329a45f 100644
--- a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S
+++ b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S
@@ -20,9 +20,9 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,                  [sp] -> x10
-#     size_t a_offset,                   [sp + 8] -> x11
+#     size_t a_offset,                   [sp + 8] -> x8
 #     const float* zero,                 [sp + 16] -> x12
-#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x8)
+#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -38,18 +38,18 @@
 # C3   x7 v19 v23 v27 v31
 # temp    v7
 # unused  v8 v9 v10 v11 v12 v13 v14 v15
-# x8, x21 temp for Cortex-A53 loads
+# x11, x21 temp for Cortex-A53 loads
 
 BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
 
         # Clamp C pointers
         CMP     x0, 2                   // if mr < 2
-        LDP     x10, x11, [sp]          // Load cn_stride, a_offset
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
         ADD     x16, x6, x7             // c1 = c0 + cm_stride
         CSEL    x16, x6,  x16, LO       //   c1 = c0
 
         ADD     x17, x16, x7            // c2 = c1 + cm_stride
-        LDP     x12, x8, [sp, 16]       // Load zero, params pointer
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                         // if mr <= 2
         CSEL    x17, x16, x17, LS       //   c2 = c1
 
@@ -84,16 +84,16 @@
         LDP     x15, x20, [x4], 16
 
         CMP     x13, x12                // if a0 == zero
-        ADD     x13, x13, x11           // a0 += a_offset
+        ADD     x13, x13, x8            // a0 += a_offset
         CSEL    x13, x12, x13, EQ       //   a0 = zero, else += a0 + a_offset
         CMP     x14, x12                // if a1 == zero
-        ADD     x14, x14, x11           // a1 += a_offset
+        ADD     x14, x14, x8            // a1 += a_offset
         CSEL    x14, x12, x14, EQ       //   a1 = zero, else += a1 + a_offset
         CMP     x15, x12                // if a2 == zero
-        ADD     x15, x15, x11           // a2 += a_offset
+        ADD     x15, x15, x8            // a2 += a_offset
         CSEL    x15, x12, x15, EQ       //   a2 = zero, else += a2 + a_offset
         CMP     x20, x12                // if a3 == zero
-        ADD     x20, x20, x11           // a3 += a_offset
+        ADD     x20, x20, x8            // a3 += a_offset
         CSEL    x20, x12, x20, EQ       //   a3 = zero, else += a3 + a_offset
 
         # Is there at least 8 bytes for epilogue?
@@ -107,7 +107,7 @@
         LDR     d2, [x15], 8
         LDR     d3, [x20], 8
         SXTL    v0.8h, v0.8b
-        LDR     x8, [x5, 16]
+        LDR     x11, [x5, 16]
         SXTL    v4.8h, v4.8b
         SXTL    v1.8h, v1.8b
         SXTL    v2.8h, v2.8b
@@ -130,7 +130,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[0]
         SMLAL2  v23.4s, v4.8h, v3.h[0]
         LDR     d4, [x5, 24]
-        INS     v5.d[0], x8
+        INS     v5.d[0], x11
         SMLAL   v24.4s, v6.4h, v0.h[0]
         SMLAL2  v28.4s, v6.8h, v0.h[0]
         SMLAL   v25.4s, v6.4h, v1.h[0]
@@ -140,7 +140,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[0]
         SMLAL   v27.4s, v6.4h, v3.h[0]
         SMLAL2  v31.4s, v6.8h, v3.h[0]
-        LDR     x8, [x5, 32]
+        LDR     x11, [x5, 32]
         SMLAL   v16.4s, v5.4h, v0.h[1]
         SMLAL2  v20.4s, v5.8h, v0.h[1]
         SMLAL   v17.4s, v5.4h, v1.h[1]
@@ -151,7 +151,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[1]
         SMLAL2  v23.4s, v5.8h, v3.h[1]
         LDR     d5, [x5, 40]
-        INS     v6.d[0], x8
+        INS     v6.d[0], x11
         SMLAL   v24.4s, v4.4h, v0.h[1]
         SMLAL2  v28.4s, v4.8h, v0.h[1]
         SMLAL   v25.4s, v4.4h, v1.h[1]
@@ -161,7 +161,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[1]
         SMLAL   v27.4s, v4.4h, v3.h[1]
         SMLAL2  v31.4s, v4.8h, v3.h[1]
-        LDR     x8, [x5, 48]
+        LDR     x11, [x5, 48]
         SMLAL   v16.4s, v6.4h, v0.h[2]
         SMLAL2  v20.4s, v6.8h, v0.h[2]
         SMLAL   v17.4s, v6.4h, v1.h[2]
@@ -172,7 +172,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[2]
         SMLAL2  v23.4s, v6.8h, v3.h[2]
         LDR     d6, [x5, 56]
-        INS     v4.d[0], x8
+        INS     v4.d[0], x11
         SMLAL   v24.4s, v5.4h, v0.h[2]
         SMLAL2  v28.4s, v5.8h, v0.h[2]
         SMLAL   v25.4s, v5.4h, v1.h[2]
@@ -182,7 +182,7 @@
         SMLAL2  v30.4s, v5.8h, v2.h[2]
         SMLAL   v27.4s, v5.4h, v3.h[2]
         SMLAL2  v31.4s, v5.8h, v3.h[2]
-        LDR     x8, [x5, 64]
+        LDR     x11, [x5, 64]
         SMLAL   v16.4s, v4.4h, v0.h[3]
         SMLAL2  v20.4s, v4.8h, v0.h[3]
         SMLAL   v17.4s, v4.4h, v1.h[3]
@@ -193,7 +193,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[3]
         SMLAL2  v23.4s, v4.8h, v3.h[3]
         LDR     d4, [x5, 72]
-        INS     v5.d[0], x8
+        INS     v5.d[0], x11
         SMLAL   v24.4s, v6.4h, v0.h[3]
         SMLAL2  v28.4s, v6.8h, v0.h[3]
         SXTL    v5.8h, v5.8b
@@ -203,7 +203,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[3]
         SMLAL   v27.4s, v6.4h, v3.h[3]
         SMLAL2  v31.4s, v6.8h, v3.h[3]
-        LDR     x8, [x5, 80]
+        LDR     x11, [x5, 80]
         SMLAL   v16.4s, v5.4h, v0.h[4]
         SMLAL2  v20.4s, v5.8h, v0.h[4]
         SMLAL   v17.4s, v5.4h, v1.h[4]
@@ -214,7 +214,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[4]
         SMLAL2  v23.4s, v5.8h, v3.h[4]
         LDR     d5, [x5, 88]
-        INS     v6.d[0], x8
+        INS     v6.d[0], x11
         SMLAL   v24.4s, v4.4h, v0.h[4]
         SMLAL2  v28.4s, v4.8h, v0.h[4]
         SMLAL   v25.4s, v4.4h, v1.h[4]
@@ -224,7 +224,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[4]
         SMLAL   v27.4s, v4.4h, v3.h[4]
         SMLAL2  v31.4s, v4.8h, v3.h[4]
-        LDR     x8, [x5, 96]
+        LDR     x11, [x5, 96]
         SMLAL   v16.4s, v6.4h, v0.h[5]
         SMLAL2  v20.4s, v6.8h, v0.h[5]
         SMLAL   v17.4s, v6.4h, v1.h[5]
@@ -235,7 +235,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[5]
         SMLAL2  v23.4s, v6.8h, v3.h[5]
         LDR     d6, [x5, 104]
-        INS     v4.d[0], x8
+        INS     v4.d[0], x11
         SMLAL   v24.4s, v5.4h, v0.h[5]
         SMLAL2  v28.4s, v5.8h, v0.h[5]
         SMLAL   v25.4s, v5.4h, v1.h[5]
@@ -246,7 +246,7 @@
         SMLAL   v27.4s, v5.4h, v3.h[5]
         SMLAL2  v31.4s, v5.8h, v3.h[5]
         SXTL    v6.8h, v6.8b
-        LDR     x8, [x5, 112]
+        LDR     x11, [x5, 112]
         SMLAL   v16.4s, v4.4h, v0.h[6]
         SMLAL2  v20.4s, v4.8h, v0.h[6]
         SMLAL   v17.4s, v4.4h, v1.h[6]
@@ -256,7 +256,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[6]
         SMLAL2  v23.4s, v4.8h, v3.h[6]
         LDR     d5, [x5, 120]
-        INS     v4.d[0], x8
+        INS     v4.d[0], x11
         SMLAL   v24.4s, v6.4h, v0.h[6]
         SMLAL2  v28.4s, v6.8h, v0.h[6]
         SMLAL   v25.4s, v6.4h, v1.h[6]
@@ -266,7 +266,7 @@
 
         SMLAL   v26.4s, v6.4h, v2.h[6]
         SMLAL2  v30.4s, v6.8h, v2.h[6]
-        LDR     x8, [x5]
+        LDR     x11, [x5]
         SMLAL   v27.4s, v6.4h, v3.h[6]
         SMLAL2  v31.4s, v6.8h, v3.h[6]
         SXTL    v5.8h, v5.8b
@@ -281,10 +281,10 @@
         SMLAL   v19.4s, v4.4h, v3.h[7]
         SMLAL2  v23.4s, v4.8h, v3.h[7]
         LDR     d6, [x5, 8]
-        INS     v4.d[0], x8
+        INS     v4.d[0], x11
         SMLAL   v24.4s, v5.4h, v0.h[7]
         SMLAL2  v28.4s, v5.8h, v0.h[7]
-        LDR     x8, [x15], 8
+        LDR     x11, [x15], 8
         SMLAL   v25.4s, v5.4h, v1.h[7]
         SMLAL2  v29.4s, v5.8h, v1.h[7]
         LDR     d1, [x14], 8
@@ -294,11 +294,11 @@
         SMLAL   v27.4s, v5.4h, v3.h[7]
         SMLAL2  v31.4s, v5.8h, v3.h[7]
         LDR     d3, [x20], 8
-        INS     v2.d[0], x8
+        INS     v2.d[0], x11
 
         SXTL    v0.8h, v0.8b
         SXTL    v1.8h, v1.8b
-        LDR     x8, [x5, 16]
+        LDR     x11, [x5, 16]
         SXTL    v4.8h, v4.8b
         SXTL    v2.8h, v2.8b
         SUBS    x0, x0, 8
@@ -319,7 +319,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[0]
         SMLAL2  v23.4s, v4.8h, v3.h[0]
         LDR     d4, [x5, 24]
-        INS     v5.d[0], x8
+        INS     v5.d[0], x11
         SMLAL   v24.4s, v6.4h, v0.h[0]
         SMLAL2  v28.4s, v6.8h, v0.h[0]
         SMLAL   v25.4s, v6.4h, v1.h[0]
@@ -329,7 +329,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[0]
         SMLAL   v27.4s, v6.4h, v3.h[0]
         SMLAL2  v31.4s, v6.8h, v3.h[0]
-        LDR     x8, [x5, 32]
+        LDR     x11, [x5, 32]
         SMLAL   v16.4s, v5.4h, v0.h[1]
         SMLAL2  v20.4s, v5.8h, v0.h[1]
         SMLAL   v17.4s, v5.4h, v1.h[1]
@@ -340,7 +340,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[1]
         SMLAL2  v23.4s, v5.8h, v3.h[1]
         LDR     d5, [x5, 40]
-        INS     v6.d[0], x8
+        INS     v6.d[0], x11
         SMLAL   v24.4s, v4.4h, v0.h[1]
         SMLAL2  v28.4s, v4.8h, v0.h[1]
         SMLAL   v25.4s, v4.4h, v1.h[1]
@@ -350,7 +350,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[1]
         SMLAL   v27.4s, v4.4h, v3.h[1]
         SMLAL2  v31.4s, v4.8h, v3.h[1]
-        LDR     x8, [x5, 48]
+        LDR     x11, [x5, 48]
         SMLAL   v16.4s, v6.4h, v0.h[2]
         SMLAL2  v20.4s, v6.8h, v0.h[2]
         SMLAL   v17.4s, v6.4h, v1.h[2]
@@ -361,7 +361,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[2]
         SMLAL2  v23.4s, v6.8h, v3.h[2]
         LDR     d6, [x5, 56]
-        INS     v4.d[0], x8
+        INS     v4.d[0], x11
         SMLAL   v24.4s, v5.4h, v0.h[2]
         SMLAL2  v28.4s, v5.8h, v0.h[2]
         SMLAL   v25.4s, v5.4h, v1.h[2]
@@ -371,7 +371,7 @@
         SMLAL2  v30.4s, v5.8h, v2.h[2]
         SMLAL   v27.4s, v5.4h, v3.h[2]
         SMLAL2  v31.4s, v5.8h, v3.h[2]
-        LDR     x8, [x5, 64]
+        LDR     x11, [x5, 64]
         SMLAL   v16.4s, v4.4h, v0.h[3]
         SMLAL2  v20.4s, v4.8h, v0.h[3]
         SMLAL   v17.4s, v4.4h, v1.h[3]
@@ -382,7 +382,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[3]
         SMLAL2  v23.4s, v4.8h, v3.h[3]
         LDR     d4, [x5, 72]
-        INS     v5.d[0], x8
+        INS     v5.d[0], x11
         SMLAL   v24.4s, v6.4h, v0.h[3]
         SMLAL2  v28.4s, v6.8h, v0.h[3]
         SXTL    v5.8h, v5.8b
@@ -392,7 +392,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[3]
         SMLAL   v27.4s, v6.4h, v3.h[3]
         SMLAL2  v31.4s, v6.8h, v3.h[3]
-        LDR     x8, [x5, 80]
+        LDR     x11, [x5, 80]
         SMLAL   v16.4s, v5.4h, v0.h[4]
         SMLAL2  v20.4s, v5.8h, v0.h[4]
         SMLAL   v17.4s, v5.4h, v1.h[4]
@@ -403,7 +403,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[4]
         SMLAL2  v23.4s, v5.8h, v3.h[4]
         LDR     d5, [x5, 88]
-        INS     v6.d[0], x8
+        INS     v6.d[0], x11
         SMLAL   v24.4s, v4.4h, v0.h[4]
         SMLAL2  v28.4s, v4.8h, v0.h[4]
         SMLAL   v25.4s, v4.4h, v1.h[4]
@@ -413,7 +413,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[4]
         SMLAL   v27.4s, v4.4h, v3.h[4]
         SMLAL2  v31.4s, v4.8h, v3.h[4]
-        LDR     x8, [x5, 96]
+        LDR     x11, [x5, 96]
         SMLAL   v16.4s, v6.4h, v0.h[5]
         SMLAL2  v20.4s, v6.8h, v0.h[5]
         SMLAL   v17.4s, v6.4h, v1.h[5]
@@ -424,7 +424,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[5]
         SMLAL2  v23.4s, v6.8h, v3.h[5]
         LDR     d6, [x5, 104]
-        INS     v4.d[0], x8
+        INS     v4.d[0], x11
         SMLAL   v24.4s, v5.4h, v0.h[5]
         SMLAL2  v28.4s, v5.8h, v0.h[5]
         SMLAL   v25.4s, v5.4h, v1.h[5]
@@ -443,13 +443,13 @@
         SMLAL2  v22.4s, v4.8h, v2.h[6]
         SMLAL   v19.4s, v4.4h, v3.h[6]
         SMLAL2  v23.4s, v4.8h, v3.h[6]
-        LDR     x8, [x5, 112]
+        LDR     x11, [x5, 112]
         SMLAL   v24.4s, v6.4h, v0.h[6]
         SMLAL2  v28.4s, v6.8h, v0.h[6]
         SMLAL   v25.4s, v6.4h, v1.h[6]
         SMLAL2  v29.4s, v6.8h, v1.h[6]
         LDR     d5, [x5, 120]
-        INS     v4.d[0], x8
+        INS     v4.d[0], x11
         SXTL    v4.8h, v4.8b
         SMLAL   v26.4s, v6.4h, v2.h[6]
         SMLAL2  v30.4s, v6.8h, v2.h[6]
@@ -472,7 +472,7 @@
         AND     x0, x2, 7               // kc remainder 0 to 7
         SMLAL   v26.4s, v5.4h, v2.h[7]
         SMLAL2  v30.4s, v5.8h, v2.h[7]
-        LDR     x8, [sp, 40]            // reload params pointer
+        LDR     x11, [sp, 40]           // reload params pointer
         SMLAL   v27.4s, v5.4h, v3.h[7]
         SMLAL2  v31.4s, v5.8h, v3.h[7]
 
@@ -548,7 +548,7 @@
         SQXTN   v25.4h, v25.4s
         SQXTN   v26.4h, v26.4s
         SQXTN   v27.4h, v27.4s
-        LD1R    {v2.8h}, [x8], 2        // add bias
+        LD1R    {v2.8h}, [x11], 2       // add bias
 
         SQXTN2  v16.8h, v20.4s
         SQXTN2  v17.8h, v21.4s
@@ -567,18 +567,18 @@
         SQADD   v25.8h, v25.8h, v2.8h
         SQADD   v26.8h, v26.8h, v2.8h
         SQADD   v27.8h, v27.8h, v2.8h
-        LD1R    {v0.16b}, [x8], 1       // clamp min value
+        LD1R    {v0.16b}, [x11], 1      // clamp min value
 
         SQXTN   v4.8b, v16.8h
         SQXTN   v5.8b, v17.8h
         SQXTN   v6.8b, v18.8h
         SQXTN   v7.8b, v19.8h
-        LD1R    {v1.16b}, [x8]          // clamp max value
+        LD1R    {v1.16b}, [x11]         // clamp max value
         SQXTN2  v4.16b, v24.8h
         SQXTN2  v5.16b, v25.8h
         SQXTN2  v6.16b, v26.8h
         SQXTN2  v7.16b, v27.8h
-        SUB     x8, x8, 3               // rewind params pointer
+        SUB     x11, x11, 3             // rewind params pointer
 
         SMAX    v4.16b, v4.16b, v0.16b
         SMAX    v5.16b, v5.16b, v0.16b
diff --git a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S
index d3da4b9..f7a0133 100644
--- a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S
+++ b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S
@@ -20,9 +20,9 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,                  [sp] -> x10
-#     size_t a_offset,                   [sp + 8] -> x11
+#     size_t a_offset,                   [sp + 8] -> x8
 #     const float* zero,                 [sp + 16] -> x12
-#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x8)
+#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -38,18 +38,18 @@
 # C3   x7 v19 v23 v27 v31
 # temp    v7
 # unused  v8 v9 v10 v11 v12 v13 v14 v15
-# x8, x21 temp for Cortex-A53 loads
+# x11, x21 temp for Cortex-A53 loads
 
 BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
 
         # Clamp C pointers
         CMP     x0, 2                   // if mr < 2
-        LDP     x10, x11, [sp]          // Load cn_stride, a_offset
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
         ADD     x16, x6, x7             // c1 = c0 + cm_stride
         CSEL    x16, x6,  x16, LO       //   c1 = c0
 
         ADD     x17, x16, x7            // c2 = c1 + cm_stride
-        LDP     x12, x8, [sp, 16]       // Load zero, params pointer
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
                                         // if mr <= 2
         CSEL    x17, x16, x17, LS       //   c2 = c1
 
@@ -84,16 +84,16 @@
         LDP     x15, x20, [x4], 16
 
         CMP     x13, x12                // if a0 == zero
-        ADD     x13, x13, x11           // a0 += a_offset
+        ADD     x13, x13, x8            // a0 += a_offset
         CSEL    x13, x12, x13, EQ       //   a0 = zero, else += a0 + a_offset
         CMP     x14, x12                // if a1 == zero
-        ADD     x14, x14, x11           // a1 += a_offset
+        ADD     x14, x14, x8            // a1 += a_offset
         CSEL    x14, x12, x14, EQ       //   a1 = zero, else += a1 + a_offset
         CMP     x15, x12                // if a2 == zero
-        ADD     x15, x15, x11           // a2 += a_offset
+        ADD     x15, x15, x8            // a2 += a_offset
         CSEL    x15, x12, x15, EQ       //   a2 = zero, else += a2 + a_offset
         CMP     x20, x12                // if a3 == zero
-        ADD     x20, x20, x11           // a3 += a_offset
+        ADD     x20, x20, x8            // a3 += a_offset
         CSEL    x20, x12, x20, EQ       //   a3 = zero, else += a3 + a_offset
 
         # Is there at least 8 bytes for epilogue?
@@ -107,7 +107,7 @@
         LDR     d2, [x15], 8
         LDR     d3, [x20], 8
         SXTL    v0.8h, v0.8b
-        LDR     x8, [x5, 16]
+        LDR     x11, [x5, 16]
         SXTL    v4.8h, v4.8b
         SXTL    v1.8h, v1.8b
         SXTL    v2.8h, v2.8b
@@ -134,7 +134,7 @@
         SMLAL2  v23.4s, v4.8h, v3.h[0]
         PRFM    PLDL1KEEP, [x20, 128]
         LDR     d4, [x5, 24]
-        INS     v5.d[0], x8
+        INS     v5.d[0], x11
         SMLAL   v24.4s, v6.4h, v0.h[0]
         SMLAL2  v28.4s, v6.8h, v0.h[0]
         PRFM    PLDL1KEEP, [x5, 448]
@@ -146,7 +146,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[0]
         SMLAL   v27.4s, v6.4h, v3.h[0]
         SMLAL2  v31.4s, v6.8h, v3.h[0]
-        LDR     x8, [x5, 32]
+        LDR     x11, [x5, 32]
         SMLAL   v16.4s, v5.4h, v0.h[1]
         SMLAL2  v20.4s, v5.8h, v0.h[1]
         SMLAL   v17.4s, v5.4h, v1.h[1]
@@ -157,7 +157,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[1]
         SMLAL2  v23.4s, v5.8h, v3.h[1]
         LDR     d5, [x5, 40]
-        INS     v6.d[0], x8
+        INS     v6.d[0], x11
         SMLAL   v24.4s, v4.4h, v0.h[1]
         SMLAL2  v28.4s, v4.8h, v0.h[1]
         SMLAL   v25.4s, v4.4h, v1.h[1]
@@ -167,7 +167,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[1]
         SMLAL   v27.4s, v4.4h, v3.h[1]
         SMLAL2  v31.4s, v4.8h, v3.h[1]
-        LDR     x8, [x5, 48]
+        LDR     x11, [x5, 48]
         SMLAL   v16.4s, v6.4h, v0.h[2]
         SMLAL2  v20.4s, v6.8h, v0.h[2]
         SMLAL   v17.4s, v6.4h, v1.h[2]
@@ -178,7 +178,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[2]
         SMLAL2  v23.4s, v6.8h, v3.h[2]
         LDR     d6, [x5, 56]
-        INS     v4.d[0], x8
+        INS     v4.d[0], x11
         SMLAL   v24.4s, v5.4h, v0.h[2]
         SMLAL2  v28.4s, v5.8h, v0.h[2]
         SMLAL   v25.4s, v5.4h, v1.h[2]
@@ -188,7 +188,7 @@
         SMLAL2  v30.4s, v5.8h, v2.h[2]
         SMLAL   v27.4s, v5.4h, v3.h[2]
         SMLAL2  v31.4s, v5.8h, v3.h[2]
-        LDR     x8, [x5, 64]
+        LDR     x11, [x5, 64]
         SMLAL   v16.4s, v4.4h, v0.h[3]
         SMLAL2  v20.4s, v4.8h, v0.h[3]
         SMLAL   v17.4s, v4.4h, v1.h[3]
@@ -199,7 +199,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[3]
         SMLAL2  v23.4s, v4.8h, v3.h[3]
         LDR     d4, [x5, 72]
-        INS     v5.d[0], x8
+        INS     v5.d[0], x11
         SMLAL   v24.4s, v6.4h, v0.h[3]
         SMLAL2  v28.4s, v6.8h, v0.h[3]
         SXTL    v5.8h, v5.8b
@@ -209,7 +209,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[3]
         SMLAL   v27.4s, v6.4h, v3.h[3]
         SMLAL2  v31.4s, v6.8h, v3.h[3]
-        LDR     x8, [x5, 80]
+        LDR     x11, [x5, 80]
         SMLAL   v16.4s, v5.4h, v0.h[4]
         SMLAL2  v20.4s, v5.8h, v0.h[4]
         SMLAL   v17.4s, v5.4h, v1.h[4]
@@ -220,7 +220,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[4]
         SMLAL2  v23.4s, v5.8h, v3.h[4]
         LDR     d5, [x5, 88]
-        INS     v6.d[0], x8
+        INS     v6.d[0], x11
         SMLAL   v24.4s, v4.4h, v0.h[4]
         SMLAL2  v28.4s, v4.8h, v0.h[4]
         SMLAL   v25.4s, v4.4h, v1.h[4]
@@ -230,7 +230,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[4]
         SMLAL   v27.4s, v4.4h, v3.h[4]
         SMLAL2  v31.4s, v4.8h, v3.h[4]
-        LDR     x8, [x5, 96]
+        LDR     x11, [x5, 96]
         SMLAL   v16.4s, v6.4h, v0.h[5]
         SMLAL2  v20.4s, v6.8h, v0.h[5]
         SMLAL   v17.4s, v6.4h, v1.h[5]
@@ -241,7 +241,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[5]
         SMLAL2  v23.4s, v6.8h, v3.h[5]
         LDR     d6, [x5, 104]
-        INS     v4.d[0], x8
+        INS     v4.d[0], x11
         SMLAL   v24.4s, v5.4h, v0.h[5]
         SMLAL2  v28.4s, v5.8h, v0.h[5]
         SMLAL   v25.4s, v5.4h, v1.h[5]
@@ -252,7 +252,7 @@
         SMLAL   v27.4s, v5.4h, v3.h[5]
         SMLAL2  v31.4s, v5.8h, v3.h[5]
         SXTL    v6.8h, v6.8b
-        LDR     x8, [x5, 112]
+        LDR     x11, [x5, 112]
         SMLAL   v16.4s, v4.4h, v0.h[6]
         SMLAL2  v20.4s, v4.8h, v0.h[6]
         SMLAL   v17.4s, v4.4h, v1.h[6]
@@ -262,7 +262,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[6]
         SMLAL2  v23.4s, v4.8h, v3.h[6]
         LDR     d5, [x5, 120]
-        INS     v4.d[0], x8
+        INS     v4.d[0], x11
         SMLAL   v24.4s, v6.4h, v0.h[6]
         SMLAL2  v28.4s, v6.8h, v0.h[6]
         SMLAL   v25.4s, v6.4h, v1.h[6]
@@ -272,7 +272,7 @@
 
         SMLAL   v26.4s, v6.4h, v2.h[6]
         SMLAL2  v30.4s, v6.8h, v2.h[6]
-        LDR     x8, [x5]
+        LDR     x11, [x5]
         SMLAL   v27.4s, v6.4h, v3.h[6]
         SMLAL2  v31.4s, v6.8h, v3.h[6]
         SXTL    v5.8h, v5.8b
@@ -287,10 +287,10 @@
         SMLAL   v19.4s, v4.4h, v3.h[7]
         SMLAL2  v23.4s, v4.8h, v3.h[7]
         LDR     d6, [x5, 8]
-        INS     v4.d[0], x8
+        INS     v4.d[0], x11
         SMLAL   v24.4s, v5.4h, v0.h[7]
         SMLAL2  v28.4s, v5.8h, v0.h[7]
-        LDR     x8, [x15], 8
+        LDR     x11, [x15], 8
         SMLAL   v25.4s, v5.4h, v1.h[7]
         SMLAL2  v29.4s, v5.8h, v1.h[7]
         LDR     d1, [x14], 8
@@ -300,11 +300,11 @@
         SMLAL   v27.4s, v5.4h, v3.h[7]
         SMLAL2  v31.4s, v5.8h, v3.h[7]
         LDR     d3, [x20], 8
-        INS     v2.d[0], x8
+        INS     v2.d[0], x11
 
         SXTL    v0.8h, v0.8b
         SXTL    v1.8h, v1.8b
-        LDR     x8, [x5, 16]
+        LDR     x11, [x5, 16]
         SXTL    v4.8h, v4.8b
         SXTL    v2.8h, v2.8b
         SUBS    x0, x0, 8
@@ -325,7 +325,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[0]
         SMLAL2  v23.4s, v4.8h, v3.h[0]
         LDR     d4, [x5, 24]
-        INS     v5.d[0], x8
+        INS     v5.d[0], x11
         SMLAL   v24.4s, v6.4h, v0.h[0]
         SMLAL2  v28.4s, v6.8h, v0.h[0]
         SMLAL   v25.4s, v6.4h, v1.h[0]
@@ -335,7 +335,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[0]
         SMLAL   v27.4s, v6.4h, v3.h[0]
         SMLAL2  v31.4s, v6.8h, v3.h[0]
-        LDR     x8, [x5, 32]
+        LDR     x11, [x5, 32]
         SMLAL   v16.4s, v5.4h, v0.h[1]
         SMLAL2  v20.4s, v5.8h, v0.h[1]
         SMLAL   v17.4s, v5.4h, v1.h[1]
@@ -346,7 +346,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[1]
         SMLAL2  v23.4s, v5.8h, v3.h[1]
         LDR     d5, [x5, 40]
-        INS     v6.d[0], x8
+        INS     v6.d[0], x11
         SMLAL   v24.4s, v4.4h, v0.h[1]
         SMLAL2  v28.4s, v4.8h, v0.h[1]
         SMLAL   v25.4s, v4.4h, v1.h[1]
@@ -356,7 +356,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[1]
         SMLAL   v27.4s, v4.4h, v3.h[1]
         SMLAL2  v31.4s, v4.8h, v3.h[1]
-        LDR     x8, [x5, 48]
+        LDR     x11, [x5, 48]
         SMLAL   v16.4s, v6.4h, v0.h[2]
         SMLAL2  v20.4s, v6.8h, v0.h[2]
         SMLAL   v17.4s, v6.4h, v1.h[2]
@@ -367,7 +367,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[2]
         SMLAL2  v23.4s, v6.8h, v3.h[2]
         LDR     d6, [x5, 56]
-        INS     v4.d[0], x8
+        INS     v4.d[0], x11
         SMLAL   v24.4s, v5.4h, v0.h[2]
         SMLAL2  v28.4s, v5.8h, v0.h[2]
         SMLAL   v25.4s, v5.4h, v1.h[2]
@@ -377,7 +377,7 @@
         SMLAL2  v30.4s, v5.8h, v2.h[2]
         SMLAL   v27.4s, v5.4h, v3.h[2]
         SMLAL2  v31.4s, v5.8h, v3.h[2]
-        LDR     x8, [x5, 64]
+        LDR     x11, [x5, 64]
         SMLAL   v16.4s, v4.4h, v0.h[3]
         SMLAL2  v20.4s, v4.8h, v0.h[3]
         SMLAL   v17.4s, v4.4h, v1.h[3]
@@ -388,7 +388,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[3]
         SMLAL2  v23.4s, v4.8h, v3.h[3]
         LDR     d4, [x5, 72]
-        INS     v5.d[0], x8
+        INS     v5.d[0], x11
         SMLAL   v24.4s, v6.4h, v0.h[3]
         SMLAL2  v28.4s, v6.8h, v0.h[3]
         SXTL    v5.8h, v5.8b
@@ -398,7 +398,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[3]
         SMLAL   v27.4s, v6.4h, v3.h[3]
         SMLAL2  v31.4s, v6.8h, v3.h[3]
-        LDR     x8, [x5, 80]
+        LDR     x11, [x5, 80]
         SMLAL   v16.4s, v5.4h, v0.h[4]
         SMLAL2  v20.4s, v5.8h, v0.h[4]
         SMLAL   v17.4s, v5.4h, v1.h[4]
@@ -409,7 +409,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[4]
         SMLAL2  v23.4s, v5.8h, v3.h[4]
         LDR     d5, [x5, 88]
-        INS     v6.d[0], x8
+        INS     v6.d[0], x11
         SMLAL   v24.4s, v4.4h, v0.h[4]
         SMLAL2  v28.4s, v4.8h, v0.h[4]
         SMLAL   v25.4s, v4.4h, v1.h[4]
@@ -419,7 +419,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[4]
         SMLAL   v27.4s, v4.4h, v3.h[4]
         SMLAL2  v31.4s, v4.8h, v3.h[4]
-        LDR     x8, [x5, 96]
+        LDR     x11, [x5, 96]
         SMLAL   v16.4s, v6.4h, v0.h[5]
         SMLAL2  v20.4s, v6.8h, v0.h[5]
         SMLAL   v17.4s, v6.4h, v1.h[5]
@@ -430,7 +430,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[5]
         SMLAL2  v23.4s, v6.8h, v3.h[5]
         LDR     d6, [x5, 104]
-        INS     v4.d[0], x8
+        INS     v4.d[0], x11
         SMLAL   v24.4s, v5.4h, v0.h[5]
         SMLAL2  v28.4s, v5.8h, v0.h[5]
         SMLAL   v25.4s, v5.4h, v1.h[5]
@@ -449,13 +449,13 @@
         SMLAL2  v22.4s, v4.8h, v2.h[6]
         SMLAL   v19.4s, v4.4h, v3.h[6]
         SMLAL2  v23.4s, v4.8h, v3.h[6]
-        LDR     x8, [x5, 112]
+        LDR     x11, [x5, 112]
         SMLAL   v24.4s, v6.4h, v0.h[6]
         SMLAL2  v28.4s, v6.8h, v0.h[6]
         SMLAL   v25.4s, v6.4h, v1.h[6]
         SMLAL2  v29.4s, v6.8h, v1.h[6]
         LDR     d5, [x5, 120]
-        INS     v4.d[0], x8
+        INS     v4.d[0], x11
         SXTL    v4.8h, v4.8b
         SMLAL   v26.4s, v6.4h, v2.h[6]
         SMLAL2  v30.4s, v6.8h, v2.h[6]
@@ -478,7 +478,7 @@
         AND     x0, x2, 7               // kc remainder 0 to 7
         SMLAL   v26.4s, v5.4h, v2.h[7]
         SMLAL2  v30.4s, v5.8h, v2.h[7]
-        LDR     x8, [sp, 40]            // reload params pointer
+        LDR     x11, [sp, 40]           // reload params pointer
         SMLAL   v27.4s, v5.4h, v3.h[7]
         SMLAL2  v31.4s, v5.8h, v3.h[7]
 
@@ -554,7 +554,7 @@
         SQXTN   v25.4h, v25.4s
         SQXTN   v26.4h, v26.4s
         SQXTN   v27.4h, v27.4s
-        LD1R    {v2.8h}, [x8], 2        // add bias
+        LD1R    {v2.8h}, [x11], 2       // add bias
 
         SQXTN2  v16.8h, v20.4s
         SQXTN2  v17.8h, v21.4s
@@ -573,18 +573,18 @@
         SQADD   v25.8h, v25.8h, v2.8h
         SQADD   v26.8h, v26.8h, v2.8h
         SQADD   v27.8h, v27.8h, v2.8h
-        LD1R    {v0.16b}, [x8], 1       // clamp min value
+        LD1R    {v0.16b}, [x11], 1      // clamp min value
 
         SQXTN   v4.8b, v16.8h
         SQXTN   v5.8b, v17.8h
         SQXTN   v6.8b, v18.8h
         SQXTN   v7.8b, v19.8h
-        LD1R    {v1.16b}, [x8]          // clamp max value
+        LD1R    {v1.16b}, [x11]         // clamp max value
         SQXTN2  v4.16b, v24.8h
         SQXTN2  v5.16b, v25.8h
         SQXTN2  v6.16b, v26.8h
         SQXTN2  v7.16b, v27.8h
-        SUB     x8, x8, 3               // rewind params pointer
+        SUB     x11, x11, 3             // rewind params pointer
 
         SMAX    v4.16b, v4.16b, v0.16b
         SMAX    v5.16b, v5.16b, v0.16b
diff --git a/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S b/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
index 65aeb9e..ec74ac8 100644
--- a/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
+++ b/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
@@ -20,9 +20,9 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,                  [sp] -> (x0)
-#     size_t a_offset,                   [sp + 8] -> x11
+#     size_t a_offset,                   [sp + 8] -> x8
 #     const float* zero,                 [sp + 16] -> x12
-#     const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
+#     const union xnn_qs8_minmax_params params [sp + 24] -> (x11)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -38,15 +38,15 @@
 # C3   x7 v19 v23 v27 v31
 # unused v12 v13 v14 v15
 
-# x8 temp for Cortex-A55 loads
+# x11 temp for Cortex-A55 loads
 
 BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
 
         # Clamp C pointers
         CMP     x0, 2                   // if mr < 2
-        LDR     x11, [sp, 8]            // Load a_offset
+        LDR     x8, [sp, 8]             // Load a_offset
         ADD     x16, x6, x7             // c1 = c0 + cm_stride
-        LDP     x12, x8, [sp, 16]       // Load zero, params pointer
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
         CSEL    x16, x6,  x16, LO       //   c1 = c0
         ADD     x2, x2, 3               // kc = (kc + 3) & ~3
         STP     d8,  d9, [sp, -32]!     // Save d8-d11 on stack
@@ -87,16 +87,16 @@
         LDP     x15, x10, [x4], 16
 
         CMP     x13, x12                // if a0 == zero
-        ADD     x13, x13, x11           // a0 += a_offset
+        ADD     x13, x13, x8            // a0 += a_offset
         CSEL    x13, x12, x13, EQ       //   a0 = zero, else += a0 + a_offset
         CMP     x14, x12                // if a1 == zero
-        ADD     x14, x14, x11           // a1 += a_offset
+        ADD     x14, x14, x8            // a1 += a_offset
         CSEL    x14, x12, x14, EQ       //   a1 = zero, else += a1 + a_offset
         CMP     x15, x12                // if a2 == zero
-        ADD     x15, x15, x11           // a2 += a_offset
+        ADD     x15, x15, x8            // a2 += a_offset
         CSEL    x15, x12, x15, EQ       //   a2 = zero, else += a2 + a_offset
         CMP     x10, x12                // if a3 == zero
-        ADD     x10, x10, x11           // a3 += a_offset
+        ADD     x10, x10, x8            // a3 += a_offset
         CSEL    x10, x12, x10, EQ       //   a3 = zero, else += a3 + a_offset
 
         # Is there at least 16 bytes for prologue/epilogue?
@@ -111,7 +111,7 @@
         LDR     d3, [x10], 8
         SUBS    x0, x0, 16              // is there 16 for main loop?
         LDR     d9,  [x5], 8
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         # Is there at least 16 bytes for main loop?
         B.LO    3f
 
@@ -127,9 +127,9 @@
         SDOT    v16.4s,  v8.16b, v0.4b[0]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v1.4b[0]
-        INS     v9.d[1], x8
+        INS     v9.d[1], x11
         SDOT    v18.4s,  v8.16b, v2.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v19.4s,  v8.16b, v3.4b[0]
         LDR     d4,  [x13], 8
 
@@ -137,9 +137,9 @@
         SDOT    v20.4s,  v9.16b, v0.4b[0]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v1.4b[0]
-        INS     v10.d[1], x8
+        INS     v10.d[1], x11
         SDOT    v22.4s,  v9.16b, v2.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v23.4s,  v9.16b, v3.4b[0]
         LDR     d5, [x14], 8
 
@@ -147,9 +147,9 @@
         SDOT    v24.4s, v10.16b, v0.4b[0]
         LDR     d8,  [x5], 8
         SDOT    v25.4s, v10.16b, v1.4b[0]
-        INS     v11.d[1], x8
+        INS     v11.d[1], x11
         SDOT    v26.4s, v10.16b, v2.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v27.4s, v10.16b, v3.4b[0]
         LDR     d6, [x15], 8
 
@@ -157,9 +157,9 @@
         SDOT    v28.4s, v11.16b, v0.4b[0]
         LDR     d9,  [x5], 8
         SDOT    v29.4s, v11.16b, v1.4b[0]
-        INS     v8.d[1], x8
+        INS     v8.d[1], x11
         SDOT    v30.4s, v11.16b, v2.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v31.4s, v11.16b, v3.4b[0]
         LDR     d7,  [x10], 8
 
@@ -167,45 +167,45 @@
         SDOT    v16.4s,  v8.16b, v0.4b[1]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v1.4b[1]
-        INS     v9.d[1], x8
+        INS     v9.d[1], x11
         SDOT    v18.4s,  v8.16b, v2.4b[1]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v19.4s,  v8.16b, v3.4b[1]
 
         # BLOCK 1
         SDOT    v20.4s,  v9.16b, v0.4b[1]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v1.4b[1]
-        INS     v10.d[1], x8
+        INS     v10.d[1], x11
         SDOT    v22.4s,  v9.16b, v2.4b[1]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v23.4s,  v9.16b, v3.4b[1]
 
         # BLOCK 2
         SDOT    v24.4s, v10.16b, v0.4b[1]
         LDR     d8,  [x5], 8
         SDOT    v25.4s, v10.16b, v1.4b[1]
-        INS     v11.d[1], x8
+        INS     v11.d[1], x11
         SDOT    v26.4s, v10.16b, v2.4b[1]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v27.4s, v10.16b, v3.4b[1]
 
         # BLOCK 4
         SDOT    v28.4s, v11.16b, v0.4b[1]
         LDR     d9,  [x5], 8
         SDOT    v29.4s, v11.16b, v1.4b[1]
-        INS     v8.d[1], x8
+        INS     v8.d[1], x11
         SDOT    v30.4s, v11.16b, v2.4b[1]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v31.4s, v11.16b, v3.4b[1]
 
         # BLOCK 0
         SDOT    v16.4s,  v8.16b, v4.4b[0]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v5.4b[0]
-        INS     v9.d[1], x8
+        INS     v9.d[1], x11
         SDOT    v18.4s,  v8.16b, v6.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v19.4s,  v8.16b, v7.4b[0]
         LDR     d0,  [x13], 8
 
@@ -213,9 +213,9 @@
         SDOT    v20.4s,  v9.16b, v4.4b[0]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v5.4b[0]
-        INS     v10.d[1], x8
+        INS     v10.d[1], x11
         SDOT    v22.4s,  v9.16b, v6.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v23.4s,  v9.16b, v7.4b[0]
         LDR     d1, [x14], 8
 
@@ -223,9 +223,9 @@
         SDOT    v24.4s, v10.16b, v4.4b[0]
         LDR     d8,  [x5], 8
         SDOT    v25.4s, v10.16b, v5.4b[0]
-        INS     v11.d[1], x8
+        INS     v11.d[1], x11
         SDOT    v26.4s, v10.16b, v6.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v27.4s, v10.16b, v7.4b[0]
         LDR     d2, [x15], 8
 
@@ -233,9 +233,9 @@
         SDOT    v28.4s, v11.16b, v4.4b[0]
         LDR     d9,  [x5], 8
         SDOT    v29.4s, v11.16b, v5.4b[0]
-        INS     v8.d[1], x8
+        INS     v8.d[1], x11
         SDOT    v30.4s, v11.16b, v6.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v31.4s, v11.16b, v7.4b[0]
         LDR     d3,  [x10], 8
 
@@ -243,27 +243,27 @@
         SDOT    v16.4s,  v8.16b, v4.4b[1]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v5.4b[1]
-        INS     v9.d[1], x8
+        INS     v9.d[1], x11
         SDOT    v18.4s,  v8.16b, v6.4b[1]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v19.4s,  v8.16b, v7.4b[1]
 
         # BLOCK 1
         SDOT    v20.4s,  v9.16b, v4.4b[1]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v5.4b[1]
-        INS     v10.d[1], x8
+        INS     v10.d[1], x11
         SDOT    v22.4s,  v9.16b, v6.4b[1]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v23.4s,  v9.16b, v7.4b[1]
 
         # BLOCK 2
         SDOT    v24.4s, v10.16b, v4.4b[1]
         LDR     d8,  [x5], 8            // First B values for block 0 and 1
         SDOT    v25.4s, v10.16b, v5.4b[1]
-        INS     v11.d[1], x8
+        INS     v11.d[1], x11
         SDOT    v26.4s, v10.16b, v6.4b[1]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v27.4s, v10.16b, v7.4b[1]
         SUBS    x0, x0, 16
 
@@ -271,9 +271,9 @@
         SDOT    v28.4s, v11.16b, v4.4b[1]
         LDR     d9,  [x5], 8
         SDOT    v29.4s, v11.16b, v5.4b[1]
-        INS     v8.d[1], x8
+        INS     v8.d[1], x11
         SDOT    v30.4s, v11.16b, v6.4b[1]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v31.4s, v11.16b, v7.4b[1]
         B.HS    2b
 
@@ -283,9 +283,9 @@
         SDOT    v16.4s,  v8.16b, v0.4b[0]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v1.4b[0]
-        INS     v9.d[1], x8
+        INS     v9.d[1], x11
         SDOT    v18.4s,  v8.16b, v2.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v19.4s,  v8.16b, v3.4b[0]
         LDR     d4,  [x13], 8
 
@@ -293,9 +293,9 @@
         SDOT    v20.4s,  v9.16b, v0.4b[0]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v1.4b[0]
-        INS     v10.d[1], x8
+        INS     v10.d[1], x11
         SDOT    v22.4s,  v9.16b, v2.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v23.4s,  v9.16b, v3.4b[0]
         LDR     d5, [x14], 8
 
@@ -303,9 +303,9 @@
         SDOT    v24.4s, v10.16b, v0.4b[0]
         LDR     d8,  [x5], 8
         SDOT    v25.4s, v10.16b, v1.4b[0]
-        INS     v11.d[1], x8
+        INS     v11.d[1], x11
         SDOT    v26.4s, v10.16b, v2.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v27.4s, v10.16b, v3.4b[0]
         LDR     d6, [x15], 8
 
@@ -313,9 +313,9 @@
         SDOT    v28.4s, v11.16b, v0.4b[0]
         LDR     d9,  [x5], 8
         SDOT    v29.4s, v11.16b, v1.4b[0]
-        INS     v8.d[1], x8
+        INS     v8.d[1], x11
         SDOT    v30.4s, v11.16b, v2.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v31.4s, v11.16b, v3.4b[0]
         LDR     d7,  [x10], 8
 
@@ -323,96 +323,96 @@
         SDOT    v16.4s,  v8.16b, v0.4b[1]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v1.4b[1]
-        INS     v9.d[1], x8
+        INS     v9.d[1], x11
         SDOT    v18.4s,  v8.16b, v2.4b[1]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v19.4s,  v8.16b, v3.4b[1]
 
         # BLOCK 1
         SDOT    v20.4s,  v9.16b, v0.4b[1]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v1.4b[1]
-        INS     v10.d[1], x8
+        INS     v10.d[1], x11
         SDOT    v22.4s,  v9.16b, v2.4b[1]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v23.4s,  v9.16b, v3.4b[1]
 
         # BLOCK 2
         SDOT    v24.4s, v10.16b, v0.4b[1]
         LDR     d8,  [x5], 8
         SDOT    v25.4s, v10.16b, v1.4b[1]
-        INS     v11.d[1], x8
+        INS     v11.d[1], x11
         SDOT    v26.4s, v10.16b, v2.4b[1]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v27.4s, v10.16b, v3.4b[1]
 
         # BLOCK 4
         SDOT    v28.4s, v11.16b, v0.4b[1]
         LDR     d9,  [x5], 8
         SDOT    v29.4s, v11.16b, v1.4b[1]
-        INS     v8.d[1], x8
+        INS     v8.d[1], x11
         SDOT    v30.4s, v11.16b, v2.4b[1]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v31.4s, v11.16b, v3.4b[1]
 
         # BLOCK 0
         SDOT    v16.4s,  v8.16b, v4.4b[0]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v5.4b[0]
-        INS     v9.d[1], x8
+        INS     v9.d[1], x11
         SDOT    v18.4s,  v8.16b, v6.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v19.4s,  v8.16b, v7.4b[0]
 
         # BLOCK 1
         SDOT    v20.4s,  v9.16b, v4.4b[0]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v5.4b[0]
-        INS     v10.d[1], x8
+        INS     v10.d[1], x11
         SDOT    v22.4s,  v9.16b, v6.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v23.4s,  v9.16b, v7.4b[0]
 
         # BLOCK 2
         SDOT    v24.4s, v10.16b, v4.4b[0]
         LDR     d8,  [x5], 8
         SDOT    v25.4s, v10.16b, v5.4b[0]
-        INS     v11.d[1], x8
+        INS     v11.d[1], x11
         SDOT    v26.4s, v10.16b, v6.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v27.4s, v10.16b, v7.4b[0]
 
         # BLOCK 3
         SDOT    v28.4s, v11.16b, v4.4b[0]
         LDR     d9,  [x5], 8
         SDOT    v29.4s, v11.16b, v5.4b[0]
-        INS     v8.d[1], x8
+        INS     v8.d[1], x11
         SDOT    v30.4s, v11.16b, v6.4b[0]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v31.4s, v11.16b, v7.4b[0]
 
         # BLOCK 0
         SDOT    v16.4s,  v8.16b, v4.4b[1]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v5.4b[1]
-        INS     v9.d[1], x8
+        INS     v9.d[1], x11
         SDOT    v18.4s,  v8.16b, v6.4b[1]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v19.4s,  v8.16b, v7.4b[1]
 
         # BLOCK 1
         SDOT    v20.4s,  v9.16b, v4.4b[1]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v5.4b[1]
-        INS     v10.d[1], x8
+        INS     v10.d[1], x11
         SDOT    v22.4s,  v9.16b, v6.4b[1]
-        LDR     x8,  [x5], 8
+        LDR     x11,  [x5], 8
         SDOT    v23.4s,  v9.16b, v7.4b[1]
 
         # BLOCK 2
         SDOT    v24.4s, v10.16b, v4.4b[1]
         SDOT    v25.4s, v10.16b, v5.4b[1]
-        INS     v11.d[1], x8
+        INS     v11.d[1], x11
         SDOT    v26.4s, v10.16b, v6.4b[1]
         SDOT    v27.4s, v10.16b, v7.4b[1]
         AND     x0, x2, 15              // kc remainder 0 to 12
@@ -420,7 +420,7 @@
         # BLOCK 3
         SDOT    v28.4s, v11.16b, v4.4b[1]
         SDOT    v29.4s, v11.16b, v5.4b[1]
-        LDR     x8, [sp, 56]            // reload params pointer
+        LDR     x11, [sp, 56]            // reload params pointer
         SDOT    v30.4s, v11.16b, v6.4b[1]
         SDOT    v31.4s, v11.16b, v7.4b[1]
 
@@ -497,7 +497,7 @@
         SQXTN   v25.4h, v25.4s
         SQXTN   v26.4h, v26.4s
         SQXTN   v27.4h, v27.4s
-        LD1R    {v2.8h}, [x8], 2       // add bias
+        LD1R    {v2.8h}, [x11], 2       // add bias
 
         SQXTN2  v16.8h, v20.4s
         SQXTN2  v17.8h, v21.4s
@@ -516,21 +516,21 @@
         SQADD   v25.8h, v25.8h, v2.8h
         SQADD   v26.8h, v26.8h, v2.8h
         SQADD   v27.8h, v27.8h, v2.8h
-        LD1R    {v0.16b}, [x8], 1      // clamp min value
+        LD1R    {v0.16b}, [x11], 1      // clamp min value
 
         SQXTN   v4.8b, v16.8h
         SQXTN   v5.8b, v17.8h
         SQXTN   v6.8b, v18.8h
         SQXTN   v7.8b, v19.8h
-        LD1R    {v1.16b}, [x8]         // clamp max value
+        LD1R    {v1.16b}, [x11]         // clamp max value
         SQXTN2  v4.16b, v24.8h
         SQXTN2  v5.16b, v25.8h
         SQXTN2  v6.16b, v26.8h
         SQXTN2  v7.16b, v27.8h
-        SUB     x8, x8, 3               // rewind params pointer
+        SUB     x11, x11, 3               // rewind params pointer
         SMAX    v4.16b, v4.16b, v0.16b
         SMAX    v5.16b, v5.16b, v0.16b
-        LDR     x0, [sp, 32]           // cn_stride
+        LDR     x0, [sp, 32]            // cn_stride
         SMAX    v6.16b, v6.16b, v0.16b
         SMAX    v7.16b, v7.16b, v0.16b
         SUBS    x1, x1, 16
diff --git a/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S b/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
index 7b68775..252aa92 100644
--- a/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
+++ b/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
@@ -20,9 +20,9 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,                  [sp] -> x10
-#     size_t a_offset,                   [sp + 8] -> x11
+#     size_t a_offset,                   [sp + 8] -> x8
 #     const float* zero,                 [sp + 16] -> x12
-#     const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
+#     const union xnn_qs8_minmax_params params [sp + 24] -> (x11)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -30,7 +30,7 @@
 # A0  x13  v0
 # A1  x14  v1
 # A2  x15  v2
-# A3   x8  v3
+# A3  x11  v3
 # B    x5  v4  v5  v6  v7
 # C0   x6 v16 v20 v24 v28
 # C1  x16 v17 v21 v25 v29
@@ -42,7 +42,7 @@
 
         # Clamp C pointers
         CMP     x0, 2                   // if mr < 2
-        LDP     x10, x11, [sp]          // Load cn_stride, a_offset
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
         ADD     x16, x6, x7             // c1 = c0 + cm_stride
         CSEL    x16, x6,  x16, LO       //   c1 = c0
         ADD     x2, x2, 3               // kc = (kc + 3) & ~3
@@ -80,20 +80,20 @@
 1:
         # Load next 4 A pointers
         LDP     x13, x14, [x4], 16
-        LDP     x15, x8, [x4], 16
+        LDP     x15, x11, [x4], 16
 
         CMP     x13, x12                // if a0 == zero
-        ADD     x13, x13, x11           // a0 += a_offset
+        ADD     x13, x13, x8            // a0 += a_offset
         CSEL    x13, x12, x13, EQ       //   a0 = zero, else += a0 + a_offset
         CMP     x14, x12                // if a1 == zero
-        ADD     x14, x14, x11           // a1 += a_offset
+        ADD     x14, x14, x8            // a1 += a_offset
         CSEL    x14, x12, x14, EQ       //   a1 = zero, else += a1 + a_offset
         CMP     x15, x12                // if a2 == zero
-        ADD     x15, x15, x11           // a2 += a_offset
+        ADD     x15, x15, x8            // a2 += a_offset
         CSEL    x15, x12, x15, EQ       //   a2 = zero, else += a2 + a_offset
-        CMP     x8, x12                 // if a3 == zero
-        ADD     x8, x8, x11             // a3 += a_offset
-        CSEL    x8, x12, x8, EQ         //   a3 = zero, else += a3 + a_offset
+        CMP     x11, x12                 // if a3 == zero
+        ADD     x11, x11, x8             // a3 += a_offset
+        CSEL    x11, x12, x11, EQ         //   a3 = zero, else += a3 + a_offset
 
         # Is there at least 16 bytes for main loop?
         SUBS    x0, x2, 16              // k = kc - 16
@@ -106,7 +106,7 @@
         LDR     q4,  [x5], 16
         LDR     q1, [x14], 16
         LDR     q2, [x15], 16
-        LDR     q3,  [x8], 16
+        LDR     q3,  [x11], 16
         LDR     q5,  [x5], 16
         SDOT    v16.4s, v4.16b,  v0.4b[0]
         SDOT    v17.4s, v4.16b,  v1.4b[0]
@@ -194,7 +194,7 @@
         SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
         B.HI    1b
 
-        LDR     x8, [sp, 24]            // reload params pointer
+        LDR     x11, [sp, 24]            // reload params pointer
 
         SCVTF   v16.4s, v16.4s
         SCVTF   v17.4s, v17.4s
@@ -261,7 +261,7 @@
         SQXTN   v25.4h, v25.4s
         SQXTN   v26.4h, v26.4s
         SQXTN   v27.4h, v27.4s
-        LD1R    {v2.8h}, [x8], 2       // add bias
+        LD1R    {v2.8h}, [x11], 2       // add bias
 
         SQXTN2  v16.8h, v20.4s
         SQXTN2  v17.8h, v21.4s
@@ -280,13 +280,13 @@
         SQADD   v25.8h, v25.8h, v2.8h
         SQADD   v26.8h, v26.8h, v2.8h
         SQADD   v27.8h, v27.8h, v2.8h
-        LD1R    {v0.16b}, [x8], 1      // clamp min value
+        LD1R    {v0.16b}, [x11], 1      // clamp min value
 
         SQXTN   v4.8b, v16.8h
         SQXTN   v5.8b, v17.8h
         SQXTN   v6.8b, v18.8h
         SQXTN   v7.8b, v19.8h
-        LD1R    {v1.16b}, [x8]         // clamp max value
+        LD1R    {v1.16b}, [x11]         // clamp max value
         SQXTN2  v4.16b, v24.8h
         SQXTN2  v5.16b, v25.8h
         SQXTN2  v6.16b, v26.8h
@@ -325,7 +325,7 @@
         LDR     q4,  [x5], 16
         LDR     d1, [x14], 8
         LDR     d2, [x15], 8
-        LDR     d3,  [x8], 8
+        LDR     d3,  [x11], 8
         LDR     q5,  [x5], 16
         SDOT    v16.4s, v4.16b,  v0.4b[0]
         SDOT    v17.4s, v4.16b,  v1.4b[0]
@@ -372,7 +372,7 @@
         LDR     q4, [x5], 16
         LDR     s1, [x14], 4
         LDR     s2, [x15], 4
-        LDR     s3,  [x8], 4
+        LDR     s3,  [x11], 4
         LDR     q5, [x5], 16
         SDOT    v16.4s, v4.16b,  v0.4b[0]
         SDOT    v17.4s, v4.16b,  v1.4b[0]
@@ -383,7 +383,7 @@
         SDOT    v21.4s, v5.16b,  v1.4b[0]
         SDOT    v22.4s, v5.16b,  v2.4b[0]
         SDOT    v23.4s, v5.16b,  v3.4b[0]
-        LDR     x8, [sp, 24]            // reload params pointer
+        LDR     x11, [sp, 24]            // reload params pointer
         SDOT    v24.4s, v6.16b, v0.4b[0]
         SDOT    v25.4s, v6.16b, v1.4b[0]
         SDOT    v26.4s, v6.16b, v2.4b[0]
diff --git a/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S b/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
index 7b2db1c..716cbf2 100644
--- a/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
+++ b/src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
@@ -20,9 +20,9 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,                  [sp] -> x10
-#     size_t a_offset,                   [sp + 8] -> x11
+#     size_t a_offset,                   [sp + 8] -> x8
 #     const float* zero,                 [sp + 16] -> x12
-#     const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
+#     const union xnn_qs8_minmax_params params [sp + 24] -> (x11)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -30,7 +30,7 @@
 # A0  x13  v0
 # A1  x14  v1
 # A2  x15  v2
-# A3   x8  v3
+# A3  x11  v3
 # B    x5  v4  v5  v6  v7
 # C0   x6 v16 v20 v24 v28
 # C1  x16 v17 v21 v25 v29
@@ -42,7 +42,7 @@
 
         # Clamp C pointers
         CMP     x0, 2                   // if mr < 2
-        LDP     x10, x11, [sp]          // Load cn_stride, a_offset
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
         ADD     x16, x6, x7             // c1 = c0 + cm_stride
         CSEL    x16, x6,  x16, LO       //   c1 = c0
         ADD     x2, x2, 3               // kc = (kc + 3) & ~3
@@ -80,20 +80,20 @@
 1:
         # Load next 4 A pointers
         LDP     x13, x14, [x4], 16
-        LDP     x15, x8, [x4], 16
+        LDP     x15, x11, [x4], 16
 
         CMP     x13, x12                // if a0 == zero
-        ADD     x13, x13, x11           // a0 += a_offset
+        ADD     x13, x13, x8            // a0 += a_offset
         CSEL    x13, x12, x13, EQ       //   a0 = zero, else += a0 + a_offset
         CMP     x14, x12                // if a1 == zero
-        ADD     x14, x14, x11           // a1 += a_offset
+        ADD     x14, x14, x8            // a1 += a_offset
         CSEL    x14, x12, x14, EQ       //   a1 = zero, else += a1 + a_offset
         CMP     x15, x12                // if a2 == zero
-        ADD     x15, x15, x11           // a2 += a_offset
+        ADD     x15, x15, x8            // a2 += a_offset
         CSEL    x15, x12, x15, EQ       //   a2 = zero, else += a2 + a_offset
-        CMP     x8, x12                 // if a3 == zero
-        ADD     x8, x8, x11             // a3 += a_offset
-        CSEL    x8, x12, x8, EQ         //   a3 = zero, else += a3 + a_offset
+        CMP     x11, x12                // if a3 == zero
+        ADD     x11, x11, x8            // a3 += a_offset
+        CSEL    x11, x12, x11, EQ       //   a3 = zero, else += a3 + a_offset
 
         # Is there at least 8 bytes for main loop?
         SUBS    x0, x2, 8               // k = kc - 8
@@ -106,7 +106,7 @@
         LDR     q4,  [x5], 16
         LDR     d1, [x14], 8
         LDR     d2, [x15], 8
-        LDR     d3, [x8], 8
+        LDR     d3, [x11], 8
         LDR     q5,  [x5], 16
         SDOT    v16.4s, v4.16b,  v0.4b[0]
         SDOT    v17.4s, v4.16b,  v1.4b[0]
@@ -149,7 +149,7 @@
         # Is there a remainder?- 4 bytes of A
         TBNZ    x0, 2, 4f
 
-        LDR     x8, [sp, 24]            // reload params pointer
+        LDR     x11, [sp, 24]           // reload params pointer
 
         # ks loop
         SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
@@ -220,7 +220,7 @@
         SQXTN   v25.4h, v25.4s
         SQXTN   v26.4h, v26.4s
         SQXTN   v27.4h, v27.4s
-        LD1R    {v2.8h}, [x8], 2        // add bias
+        LD1R    {v2.8h}, [x11], 2        // add bias
 
         SQXTN2  v16.8h, v20.4s
         SQXTN2  v17.8h, v21.4s
@@ -239,13 +239,13 @@
         SQADD   v25.8h, v25.8h, v2.8h
         SQADD   v26.8h, v26.8h, v2.8h
         SQADD   v27.8h, v27.8h, v2.8h
-        LD1R    {v0.16b}, [x8], 1       // clamp min value
+        LD1R    {v0.16b}, [x11], 1       // clamp min value
 
         SQXTN   v4.8b, v16.8h
         SQXTN   v5.8b, v17.8h
         SQXTN   v6.8b, v18.8h
         SQXTN   v7.8b, v19.8h
-        LD1R    {v1.16b}, [x8]          // clamp max value
+        LD1R    {v1.16b}, [x11]          // clamp max value
         SQXTN2  v4.16b, v24.8h
         SQXTN2  v5.16b, v25.8h
         SQXTN2  v6.16b, v26.8h
@@ -281,7 +281,7 @@
         LDR     q4, [x5], 16
         LDR     s1, [x14], 4
         LDR     s2, [x15], 4
-        LDR     s3, [x8], 4
+        LDR     s3, [x11], 4
         LDR     q5, [x5], 16
         SDOT    v16.4s, v4.16b,  v0.4b[0]
         SDOT    v17.4s, v4.16b,  v1.4b[0]
@@ -292,7 +292,7 @@
         SDOT    v21.4s, v5.16b,  v1.4b[0]
         SDOT    v22.4s, v5.16b,  v2.4b[0]
         SDOT    v23.4s, v5.16b,  v3.4b[0]
-        LDR     x8, [sp, 24]            // reload params pointer
+        LDR     x11, [sp, 24]            // reload params pointer
         SDOT    v24.4s, v6.16b, v0.4b[0]
         SDOT    v25.4s, v6.16b, v1.4b[0]
         SDOT    v26.4s, v6.16b, v2.4b[0]