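4x16 QS8 IGEMM microkernels use x8 for temp

The Cortex-A53 and Cortex-A55 4x16 kernels now use x8 instead of a
callee-saved register (x23 and x21 respectively) as the temporary for
64-bit loads, so one fewer register is saved and restored. x8 also holds
the params pointer, so it is reloaded from the stack before it is needed
again.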

PiperOrigin-RevId: 375180818
diff --git a/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in b/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
index fd79cbd..fe12a92 100644
--- a/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
+++ b/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
@@ -17,7 +17,7 @@
 #     size_t cn_stride,                  [sp] -> x10
 #     size_t a_offset,                   [sp + 8] -> x11
 #     const float* zero,                 [sp + 16] -> x12
-#     const xnn_f32_minmax_params params [sp + 24] -> x8
+#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -33,7 +33,7 @@
 # C3   x7 v19 v23 v27 v31
 # temp    v7
 # unused  v8 v9 v10 v11 v12 v13 v14 v15
-# x21, x22, x23 temp for Cortex-A53 loads
+# x8, x21, x22 temp for Cortex-A53 loads
 
 BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a53
 
@@ -49,9 +49,9 @@
         CSEL    x17, x16, x17, LS       //   c2 = c1
 
         CMP     x0, 4                   // if mr < 4
-        STP     x20, x21, [sp, -32]!    // Save x20-x23 on stack
-        STP     x22, x23, [sp, 16]
+        STP     x20, x21, [sp, -32]!    // Save x20-x22 on stack
         ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        STR     x22, [sp, 16]
         CSEL    x7,  x17, x7, LO        //   c3 = c2
 
         .p2align 3
@@ -103,7 +103,7 @@
         LDR     d2, [x15], 8
         LDR     d3, [x20], 8
         SXTL    v0.8h, v0.8b
-        LDR     x23, [x5, 16]
+        LDR     x8, [x5, 16]
         SXTL    v4.8h, v4.8b
         SXTL    v1.8h, v1.8b
         SXTL    v2.8h, v2.8b
@@ -134,7 +134,7 @@
         $if PREFETCH:
           PRFM    PLDL1KEEP, [x20, 128]
         LDR     d4, [x5, 24]
-        INS     v5.d[0], x23
+        INS     v5.d[0], x8
         SMLAL   v24.4s, v6.4h, v0.h[0]
         SMLAL2  v28.4s, v6.8h, v0.h[0]
         $if PREFETCH:
@@ -148,7 +148,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[0]
         SMLAL   v27.4s, v6.4h, v3.h[0]
         SMLAL2  v31.4s, v6.8h, v3.h[0]
-        LDR     x23, [x5, 32]
+        LDR     x8, [x5, 32]
         SMLAL   v16.4s, v5.4h, v0.h[1]
         SMLAL2  v20.4s, v5.8h, v0.h[1]
         SMLAL   v17.4s, v5.4h, v1.h[1]
@@ -159,7 +159,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[1]
         SMLAL2  v23.4s, v5.8h, v3.h[1]
         LDR     d5, [x5, 40]
-        INS     v6.d[0], x23
+        INS     v6.d[0], x8
         SMLAL   v24.4s, v4.4h, v0.h[1]
         SMLAL2  v28.4s, v4.8h, v0.h[1]
         SMLAL   v25.4s, v4.4h, v1.h[1]
@@ -169,7 +169,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[1]
         SMLAL   v27.4s, v4.4h, v3.h[1]
         SMLAL2  v31.4s, v4.8h, v3.h[1]
-        LDR     x23, [x5, 48]
+        LDR     x8, [x5, 48]
         SMLAL   v16.4s, v6.4h, v0.h[2]
         SMLAL2  v20.4s, v6.8h, v0.h[2]
         SMLAL   v17.4s, v6.4h, v1.h[2]
@@ -180,7 +180,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[2]
         SMLAL2  v23.4s, v6.8h, v3.h[2]
         LDR     d6, [x5, 56]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SMLAL   v24.4s, v5.4h, v0.h[2]
         SMLAL2  v28.4s, v5.8h, v0.h[2]
         SMLAL   v25.4s, v5.4h, v1.h[2]
@@ -190,7 +190,7 @@
         SMLAL2  v30.4s, v5.8h, v2.h[2]
         SMLAL   v27.4s, v5.4h, v3.h[2]
         SMLAL2  v31.4s, v5.8h, v3.h[2]
-        LDR     x23, [x5, 64]
+        LDR     x8, [x5, 64]
         SMLAL   v16.4s, v4.4h, v0.h[3]
         SMLAL2  v20.4s, v4.8h, v0.h[3]
         SMLAL   v17.4s, v4.4h, v1.h[3]
@@ -201,7 +201,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[3]
         SMLAL2  v23.4s, v4.8h, v3.h[3]
         LDR     d4, [x5, 72]
-        INS     v5.d[0], x23
+        INS     v5.d[0], x8
         SMLAL   v24.4s, v6.4h, v0.h[3]
         SMLAL2  v28.4s, v6.8h, v0.h[3]
         SXTL    v5.8h, v5.8b
@@ -211,7 +211,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[3]
         SMLAL   v27.4s, v6.4h, v3.h[3]
         SMLAL2  v31.4s, v6.8h, v3.h[3]
-        LDR     x23, [x5, 80]
+        LDR     x8, [x5, 80]
         SMLAL   v16.4s, v5.4h, v0.h[4]
         SMLAL2  v20.4s, v5.8h, v0.h[4]
         SMLAL   v17.4s, v5.4h, v1.h[4]
@@ -222,7 +222,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[4]
         SMLAL2  v23.4s, v5.8h, v3.h[4]
         LDR     d5, [x5, 88]
-        INS     v6.d[0], x23
+        INS     v6.d[0], x8
         SMLAL   v24.4s, v4.4h, v0.h[4]
         SMLAL2  v28.4s, v4.8h, v0.h[4]
         SMLAL   v25.4s, v4.4h, v1.h[4]
@@ -232,7 +232,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[4]
         SMLAL   v27.4s, v4.4h, v3.h[4]
         SMLAL2  v31.4s, v4.8h, v3.h[4]
-        LDR     x23, [x5, 96]
+        LDR     x8, [x5, 96]
         SMLAL   v16.4s, v6.4h, v0.h[5]
         SMLAL2  v20.4s, v6.8h, v0.h[5]
         SMLAL   v17.4s, v6.4h, v1.h[5]
@@ -243,7 +243,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[5]
         SMLAL2  v23.4s, v6.8h, v3.h[5]
         LDR     d6, [x5, 104]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SMLAL   v24.4s, v5.4h, v0.h[5]
         SMLAL2  v28.4s, v5.8h, v0.h[5]
         SMLAL   v25.4s, v5.4h, v1.h[5]
@@ -262,13 +262,13 @@
         SMLAL2  v22.4s, v4.8h, v2.h[6]
         SMLAL   v19.4s, v4.4h, v3.h[6]
         SMLAL2  v23.4s, v4.8h, v3.h[6]
-        LDR     x23, [x5, 112]
+        LDR     x8, [x5, 112]
         SMLAL   v24.4s, v6.4h, v0.h[6]
         SMLAL2  v28.4s, v6.8h, v0.h[6]
         SMLAL   v25.4s, v6.4h, v1.h[6]
         SMLAL2  v29.4s, v6.8h, v1.h[6]
         LDR     d5, [x5, 120]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SXTL    v4.8h, v4.8b
         ADD     x5, x5, 128
 
@@ -284,7 +284,7 @@
         SXTL    v5.8h, v5.8b
         SMLAL   v18.4s, v4.4h, v2.h[7]
         SMLAL2  v22.4s, v4.8h, v2.h[7]
-        LDR     x23, [x5]
+        LDR     x8, [x5]
         SMLAL   v19.4s, v4.4h, v3.h[7]
         SMLAL2  v23.4s, v4.8h, v3.h[7]
         LDR     x21, [x13], 8
@@ -304,13 +304,13 @@
         LDR     d3, [x20], 8
         INS     v2.d[0], x22
         LDR     d6, [x5, 8]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SXTL    v0.8h, v0.8b
         SXTL    v1.8h, v1.8b
         SUBS    x0, x0, 8
         SXTL    v4.8h, v4.8b
         SXTL    v2.8h, v2.8b
-        LDR     x23, [x5, 16]
+        LDR     x8, [x5, 16]
         SXTL    v3.8h, v3.8b
         SXTL    v6.8h, v6.8b
         B.HS    2b
@@ -328,7 +328,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[0]
         SMLAL2  v23.4s, v4.8h, v3.h[0]
         LDR     d4, [x5, 24]
-        INS     v5.d[0], x23
+        INS     v5.d[0], x8
         SMLAL   v24.4s, v6.4h, v0.h[0]
         SMLAL2  v28.4s, v6.8h, v0.h[0]
         SMLAL   v25.4s, v6.4h, v1.h[0]
@@ -338,7 +338,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[0]
         SMLAL   v27.4s, v6.4h, v3.h[0]
         SMLAL2  v31.4s, v6.8h, v3.h[0]
-        LDR     x23, [x5, 32]
+        LDR     x8, [x5, 32]
         SMLAL   v16.4s, v5.4h, v0.h[1]
         SMLAL2  v20.4s, v5.8h, v0.h[1]
         SMLAL   v17.4s, v5.4h, v1.h[1]
@@ -349,7 +349,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[1]
         SMLAL2  v23.4s, v5.8h, v3.h[1]
         LDR     d5, [x5, 40]
-        INS     v6.d[0], x23
+        INS     v6.d[0], x8
         SMLAL   v24.4s, v4.4h, v0.h[1]
         SMLAL2  v28.4s, v4.8h, v0.h[1]
         SMLAL   v25.4s, v4.4h, v1.h[1]
@@ -359,7 +359,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[1]
         SMLAL   v27.4s, v4.4h, v3.h[1]
         SMLAL2  v31.4s, v4.8h, v3.h[1]
-        LDR     x23, [x5, 48]
+        LDR     x8, [x5, 48]
         SMLAL   v16.4s, v6.4h, v0.h[2]
         SMLAL2  v20.4s, v6.8h, v0.h[2]
         SMLAL   v17.4s, v6.4h, v1.h[2]
@@ -370,7 +370,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[2]
         SMLAL2  v23.4s, v6.8h, v3.h[2]
         LDR     d6, [x5, 56]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SMLAL   v24.4s, v5.4h, v0.h[2]
         SMLAL2  v28.4s, v5.8h, v0.h[2]
         SMLAL   v25.4s, v5.4h, v1.h[2]
@@ -380,7 +380,7 @@
         SMLAL2  v30.4s, v5.8h, v2.h[2]
         SMLAL   v27.4s, v5.4h, v3.h[2]
         SMLAL2  v31.4s, v5.8h, v3.h[2]
-        LDR     x23, [x5, 64]
+        LDR     x8, [x5, 64]
         SMLAL   v16.4s, v4.4h, v0.h[3]
         SMLAL2  v20.4s, v4.8h, v0.h[3]
         SMLAL   v17.4s, v4.4h, v1.h[3]
@@ -391,7 +391,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[3]
         SMLAL2  v23.4s, v4.8h, v3.h[3]
         LDR     d4, [x5, 72]
-        INS     v5.d[0], x23
+        INS     v5.d[0], x8
         SMLAL   v24.4s, v6.4h, v0.h[3]
         SMLAL2  v28.4s, v6.8h, v0.h[3]
         SXTL    v5.8h, v5.8b
@@ -401,7 +401,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[3]
         SMLAL   v27.4s, v6.4h, v3.h[3]
         SMLAL2  v31.4s, v6.8h, v3.h[3]
-        LDR     x23, [x5, 80]
+        LDR     x8, [x5, 80]
         SMLAL   v16.4s, v5.4h, v0.h[4]
         SMLAL2  v20.4s, v5.8h, v0.h[4]
         SMLAL   v17.4s, v5.4h, v1.h[4]
@@ -412,7 +412,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[4]
         SMLAL2  v23.4s, v5.8h, v3.h[4]
         LDR     d5, [x5, 88]
-        INS     v6.d[0], x23
+        INS     v6.d[0], x8
         SMLAL   v24.4s, v4.4h, v0.h[4]
         SMLAL2  v28.4s, v4.8h, v0.h[4]
         SMLAL   v25.4s, v4.4h, v1.h[4]
@@ -422,7 +422,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[4]
         SMLAL   v27.4s, v4.4h, v3.h[4]
         SMLAL2  v31.4s, v4.8h, v3.h[4]
-        LDR     x23, [x5, 96]
+        LDR     x8, [x5, 96]
         SMLAL   v16.4s, v6.4h, v0.h[5]
         SMLAL2  v20.4s, v6.8h, v0.h[5]
         SMLAL   v17.4s, v6.4h, v1.h[5]
@@ -433,7 +433,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[5]
         SMLAL2  v23.4s, v6.8h, v3.h[5]
         LDR     d6, [x5, 104]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SMLAL   v24.4s, v5.4h, v0.h[5]
         SMLAL2  v28.4s, v5.8h, v0.h[5]
         SMLAL   v25.4s, v5.4h, v1.h[5]
@@ -452,13 +452,13 @@
         SMLAL2  v22.4s, v4.8h, v2.h[6]
         SMLAL   v19.4s, v4.4h, v3.h[6]
         SMLAL2  v23.4s, v4.8h, v3.h[6]
-        LDR     x23, [x5, 112]
+        LDR     x8, [x5, 112]
         SMLAL   v24.4s, v6.4h, v0.h[6]
         SMLAL2  v28.4s, v6.8h, v0.h[6]
         SMLAL   v25.4s, v6.4h, v1.h[6]
         SMLAL2  v29.4s, v6.8h, v1.h[6]
         LDR     d5, [x5, 120]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SXTL    v4.8h, v4.8b
         SMLAL   v26.4s, v6.4h, v2.h[6]
         SMLAL2  v30.4s, v6.8h, v2.h[6]
@@ -478,9 +478,10 @@
         SMLAL2  v28.4s, v5.8h, v0.h[7]
         SMLAL   v25.4s, v5.4h, v1.h[7]
         SMLAL2  v29.4s, v5.8h, v1.h[7]
-        AND     x0, x2, 7              // kc remainder 0 to 7
+        AND     x0, x2, 7               // kc remainder 0 to 7
         SMLAL   v26.4s, v5.4h, v2.h[7]
         SMLAL2  v30.4s, v5.8h, v2.h[7]
+        LDR     x8, [sp, 56]            // reload params pointer
         SMLAL   v27.4s, v5.4h, v3.h[7]
         SMLAL2  v31.4s, v5.8h, v3.h[7]
 
@@ -635,8 +636,8 @@
         # nc loop
         B.HI    0b
 
-        # Restore x20-x23 from stack
-        LDP     x22, x23, [sp, 16]
+        # Restore x20-x22 from stack
+        LDR     x22, [sp, 16]
         LDP     x20, x21, [sp], 32
         RET
 
@@ -842,8 +843,8 @@
         ST1     {v5.b}[0], [x16]
         ST1     {v4.b}[0], [x6]
 9:
-        # Restore x20-x23 from stack
-        LDP     x22, x23, [sp, 16]
+        # Restore x20-x22 from stack
+        LDR     x22, [sp, 16]
         LDP     x20, x21, [sp], 32
         RET
 
diff --git a/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S b/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
index 746e5d0..0774357 100644
--- a/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
+++ b/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
@@ -17,7 +17,7 @@
 #     size_t cn_stride,                  [sp] -> x10
 #     size_t a_offset,                   [sp + 8] -> x11
 #     const float* zero,                 [sp + 16] -> x12
-#     const xnn_f32_minmax_params params [sp + 24] -> x8
+#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -33,7 +33,7 @@
 # C3   x7 v19 v23 v27 v31
 # unused v12 v13 v14 v15
 
-# x21 temp for Cortex-A55 loads
+# x8 temp for Cortex-A55 loads (clobbers the params pointer, which is reloaded from the stack)
 
 BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55
 
@@ -43,7 +43,7 @@
         ADD     x16, x6, x7             // c1 = c0 + cm_stride
         LDP     x12, x8, [sp, 16]       // Load zero, params pointer
         CSEL    x16, x6,  x16, LO       //   c1 = c0
-        STP     x20, x21, [sp, -48]!    // Save x20-x21 on stack
+        STR     x20, [sp, -48]!         // Save x20 on stack
         ADD     x2, x2, 3               // kc = (kc + 3) & ~3
         STP     d8,  d9, [sp, 16]       // Save d8-d11 on stack
 
@@ -107,7 +107,7 @@
         LDR     d3, [x20], 8
         SUBS    x0, x0, 16              // is there 16 for main loop?
         LDR     d9,  [x5], 8
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         # Is there at least 16 bytes for main loop?
         B.LO    3f
 
@@ -123,9 +123,9 @@
         SDOT    v16.4s,  v8.16b, v0.4b[0]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v1.4b[0]
-        INS     v9.d[1], x21
+        INS     v9.d[1], x8
         SDOT    v18.4s,  v8.16b, v2.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v19.4s,  v8.16b, v3.4b[0]
         LDR     d4,  [x13], 8
 
@@ -133,9 +133,9 @@
         SDOT    v20.4s,  v9.16b, v0.4b[0]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v1.4b[0]
-        INS     v10.d[1], x21
+        INS     v10.d[1], x8
         SDOT    v22.4s,  v9.16b, v2.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v23.4s,  v9.16b, v3.4b[0]
         LDR     d5, [x14], 8
 
@@ -143,9 +143,9 @@
         SDOT    v24.4s, v10.16b, v0.4b[0]
         LDR     d8,  [x5], 8
         SDOT    v25.4s, v10.16b, v1.4b[0]
-        INS     v11.d[1], x21
+        INS     v11.d[1], x8
         SDOT    v26.4s, v10.16b, v2.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v27.4s, v10.16b, v3.4b[0]
         LDR     d6, [x15], 8
 
@@ -153,9 +153,9 @@
         SDOT    v28.4s, v11.16b, v0.4b[0]
         LDR     d9,  [x5], 8
         SDOT    v29.4s, v11.16b, v1.4b[0]
-        INS     v8.d[1], x21
+        INS     v8.d[1], x8
         SDOT    v30.4s, v11.16b, v2.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v31.4s, v11.16b, v3.4b[0]
         LDR     d7,  [x20], 8
 
@@ -163,45 +163,45 @@
         SDOT    v16.4s,  v8.16b, v0.4b[1]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v1.4b[1]
-        INS     v9.d[1], x21
+        INS     v9.d[1], x8
         SDOT    v18.4s,  v8.16b, v2.4b[1]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v19.4s,  v8.16b, v3.4b[1]
 
         // BLOCK 1
         SDOT    v20.4s,  v9.16b, v0.4b[1]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v1.4b[1]
-        INS     v10.d[1], x21
+        INS     v10.d[1], x8
         SDOT    v22.4s,  v9.16b, v2.4b[1]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v23.4s,  v9.16b, v3.4b[1]
 
         // BLOCK 2
         SDOT    v24.4s, v10.16b, v0.4b[1]
         LDR     d8,  [x5], 8
         SDOT    v25.4s, v10.16b, v1.4b[1]
-        INS     v11.d[1], x21
+        INS     v11.d[1], x8
         SDOT    v26.4s, v10.16b, v2.4b[1]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v27.4s, v10.16b, v3.4b[1]
 
         // BLOCK 4
         SDOT    v28.4s, v11.16b, v0.4b[1]
         LDR     d9,  [x5], 8
         SDOT    v29.4s, v11.16b, v1.4b[1]
-        INS     v8.d[1], x21
+        INS     v8.d[1], x8
         SDOT    v30.4s, v11.16b, v2.4b[1]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v31.4s, v11.16b, v3.4b[1]
 
         // BLOCK 0
         SDOT    v16.4s,  v8.16b, v4.4b[0]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v5.4b[0]
-        INS     v9.d[1], x21
+        INS     v9.d[1], x8
         SDOT    v18.4s,  v8.16b, v6.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v19.4s,  v8.16b, v7.4b[0]
         LDR     d0,  [x13], 8
 
@@ -209,9 +209,9 @@
         SDOT    v20.4s,  v9.16b, v4.4b[0]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v5.4b[0]
-        INS     v10.d[1], x21
+        INS     v10.d[1], x8
         SDOT    v22.4s,  v9.16b, v6.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v23.4s,  v9.16b, v7.4b[0]
         LDR     d1, [x14], 8
 
@@ -219,9 +219,9 @@
         SDOT    v24.4s, v10.16b, v4.4b[0]
         LDR     d8,  [x5], 8
         SDOT    v25.4s, v10.16b, v5.4b[0]
-        INS     v11.d[1], x21
+        INS     v11.d[1], x8
         SDOT    v26.4s, v10.16b, v6.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v27.4s, v10.16b, v7.4b[0]
         LDR     d2, [x15], 8
 
@@ -229,9 +229,9 @@
         SDOT    v28.4s, v11.16b, v4.4b[0]
         LDR     d9,  [x5], 8
         SDOT    v29.4s, v11.16b, v5.4b[0]
-        INS     v8.d[1], x21
+        INS     v8.d[1], x8
         SDOT    v30.4s, v11.16b, v6.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v31.4s, v11.16b, v7.4b[0]
         LDR     d3,  [x20], 8
 
@@ -239,27 +239,27 @@
         SDOT    v16.4s,  v8.16b, v4.4b[1]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v5.4b[1]
-        INS     v9.d[1], x21
+        INS     v9.d[1], x8
         SDOT    v18.4s,  v8.16b, v6.4b[1]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v19.4s,  v8.16b, v7.4b[1]
 
         // BLOCK 1
         SDOT    v20.4s,  v9.16b, v4.4b[1]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v5.4b[1]
-        INS     v10.d[1], x21
+        INS     v10.d[1], x8
         SDOT    v22.4s,  v9.16b, v6.4b[1]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v23.4s,  v9.16b, v7.4b[1]
 
         // BLOCK 2
         SDOT    v24.4s, v10.16b, v4.4b[1]
         LDR     d8,  [x5], 8            // First B values for block 0 and 1
         SDOT    v25.4s, v10.16b, v5.4b[1]
-        INS     v11.d[1], x21
+        INS     v11.d[1], x8
         SDOT    v26.4s, v10.16b, v6.4b[1]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v27.4s, v10.16b, v7.4b[1]
         SUBS    x0, x0, 16
 
@@ -267,9 +267,9 @@
         SDOT    v28.4s, v11.16b, v4.4b[1]
         LDR     d9,  [x5], 8
         SDOT    v29.4s, v11.16b, v5.4b[1]
-        INS     v8.d[1], x21
+        INS     v8.d[1], x8
         SDOT    v30.4s, v11.16b, v6.4b[1]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v31.4s, v11.16b, v7.4b[1]
         B.HS    2b
 
@@ -279,9 +279,9 @@
         SDOT    v16.4s,  v8.16b, v0.4b[0]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v1.4b[0]
-        INS     v9.d[1], x21
+        INS     v9.d[1], x8
         SDOT    v18.4s,  v8.16b, v2.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v19.4s,  v8.16b, v3.4b[0]
         LDR     d4,  [x13], 8
 
@@ -289,9 +289,9 @@
         SDOT    v20.4s,  v9.16b, v0.4b[0]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v1.4b[0]
-        INS     v10.d[1], x21
+        INS     v10.d[1], x8
         SDOT    v22.4s,  v9.16b, v2.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v23.4s,  v9.16b, v3.4b[0]
         LDR     d5, [x14], 8
 
@@ -299,9 +299,9 @@
         SDOT    v24.4s, v10.16b, v0.4b[0]
         LDR     d8,  [x5], 8
         SDOT    v25.4s, v10.16b, v1.4b[0]
-        INS     v11.d[1], x21
+        INS     v11.d[1], x8
         SDOT    v26.4s, v10.16b, v2.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v27.4s, v10.16b, v3.4b[0]
         LDR     d6, [x15], 8
 
@@ -309,9 +309,9 @@
         SDOT    v28.4s, v11.16b, v0.4b[0]
         LDR     d9,  [x5], 8
         SDOT    v29.4s, v11.16b, v1.4b[0]
-        INS     v8.d[1], x21
+        INS     v8.d[1], x8
         SDOT    v30.4s, v11.16b, v2.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v31.4s, v11.16b, v3.4b[0]
         LDR     d7,  [x20], 8
 
@@ -319,96 +319,96 @@
         SDOT    v16.4s,  v8.16b, v0.4b[1]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v1.4b[1]
-        INS     v9.d[1], x21
+        INS     v9.d[1], x8
         SDOT    v18.4s,  v8.16b, v2.4b[1]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v19.4s,  v8.16b, v3.4b[1]
 
         // BLOCK 1
         SDOT    v20.4s,  v9.16b, v0.4b[1]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v1.4b[1]
-        INS     v10.d[1], x21
+        INS     v10.d[1], x8
         SDOT    v22.4s,  v9.16b, v2.4b[1]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v23.4s,  v9.16b, v3.4b[1]
 
         // BLOCK 2
         SDOT    v24.4s, v10.16b, v0.4b[1]
         LDR     d8,  [x5], 8
         SDOT    v25.4s, v10.16b, v1.4b[1]
-        INS     v11.d[1], x21
+        INS     v11.d[1], x8
         SDOT    v26.4s, v10.16b, v2.4b[1]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v27.4s, v10.16b, v3.4b[1]
 
         // BLOCK 4
         SDOT    v28.4s, v11.16b, v0.4b[1]
         LDR     d9,  [x5], 8
         SDOT    v29.4s, v11.16b, v1.4b[1]
-        INS     v8.d[1], x21
+        INS     v8.d[1], x8
         SDOT    v30.4s, v11.16b, v2.4b[1]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v31.4s, v11.16b, v3.4b[1]
 
         // BLOCK 0
         SDOT    v16.4s,  v8.16b, v4.4b[0]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v5.4b[0]
-        INS     v9.d[1], x21
+        INS     v9.d[1], x8
         SDOT    v18.4s,  v8.16b, v6.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v19.4s,  v8.16b, v7.4b[0]
 
         // BLOCK 1
         SDOT    v20.4s,  v9.16b, v4.4b[0]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v5.4b[0]
-        INS     v10.d[1], x21
+        INS     v10.d[1], x8
         SDOT    v22.4s,  v9.16b, v6.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v23.4s,  v9.16b, v7.4b[0]
 
         // BLOCK 2
         SDOT    v24.4s, v10.16b, v4.4b[0]
         LDR     d8,  [x5], 8
         SDOT    v25.4s, v10.16b, v5.4b[0]
-        INS     v11.d[1], x21
+        INS     v11.d[1], x8
         SDOT    v26.4s, v10.16b, v6.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v27.4s, v10.16b, v7.4b[0]
 
         // BLOCK 3
         SDOT    v28.4s, v11.16b, v4.4b[0]
         LDR     d9,  [x5], 8
         SDOT    v29.4s, v11.16b, v5.4b[0]
-        INS     v8.d[1], x21
+        INS     v8.d[1], x8
         SDOT    v30.4s, v11.16b, v6.4b[0]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v31.4s, v11.16b, v7.4b[0]
 
         // BLOCK 0
         SDOT    v16.4s,  v8.16b, v4.4b[1]
         LDR     d10,  [x5], 8
         SDOT    v17.4s,  v8.16b, v5.4b[1]
-        INS     v9.d[1], x21
+        INS     v9.d[1], x8
         SDOT    v18.4s,  v8.16b, v6.4b[1]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v19.4s,  v8.16b, v7.4b[1]
 
         // BLOCK 1
         SDOT    v20.4s,  v9.16b, v4.4b[1]
         LDR     d11,  [x5], 8
         SDOT    v21.4s,  v9.16b, v5.4b[1]
-        INS     v10.d[1], x21
+        INS     v10.d[1], x8
         SDOT    v22.4s,  v9.16b, v6.4b[1]
-        LDR     x21,  [x5], 8
+        LDR     x8,  [x5], 8
         SDOT    v23.4s,  v9.16b, v7.4b[1]
 
         // BLOCK 2
         SDOT    v24.4s, v10.16b, v4.4b[1]
         SDOT    v25.4s, v10.16b, v5.4b[1]
-        INS     v11.d[1], x21
+        INS     v11.d[1], x8
         SDOT    v26.4s, v10.16b, v6.4b[1]
         SDOT    v27.4s, v10.16b, v7.4b[1]
         AND     x0, x2, 15              // kc remainder 0 to 12
@@ -416,6 +416,7 @@
         // BLOCK 3
         SDOT    v28.4s, v11.16b, v4.4b[1]
         SDOT    v29.4s, v11.16b, v5.4b[1]
+        LDR     x8, [sp, 72]            // x8 was temp: reload params ([sp + 24] at entry + 48-byte frame)
         SDOT    v30.4s, v11.16b, v6.4b[1]
         SDOT    v31.4s, v11.16b, v7.4b[1]
 
@@ -570,8 +571,8 @@
         LDP     d10, d11, [sp, 32]
         LDP     d8,  d9, [sp, 16]
 
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 48
+        # Restore x20 from stack
+        LDR     x20, [sp], 48
         RET
 
         # Remainder- 4 to 12 bytes of A
@@ -687,8 +688,8 @@
         LDP     d10, d11, [sp, 32]
         LDP     d8,  d9, [sp, 16]
 
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 48
+        # Restore x20 from stack
+        LDR     x20, [sp], 48
         RET
 
 END_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55
diff --git a/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-cortex-a53.S b/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-cortex-a53.S
index 9084aea..7d502ae 100644
--- a/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-cortex-a53.S
+++ b/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-cortex-a53.S
@@ -21,7 +21,7 @@
 #     size_t cn_stride,                  [sp] -> x10
 #     size_t a_offset,                   [sp + 8] -> x11
 #     const float* zero,                 [sp + 16] -> x12
-#     const xnn_f32_minmax_params params [sp + 24] -> x8
+#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -37,7 +37,7 @@
 # C3   x7 v19 v23 v27 v31
 # temp    v7
 # unused  v8 v9 v10 v11 v12 v13 v14 v15
-# x21, x22, x23 temp for Cortex-A53 loads
+# x8, x21, x22 temp for Cortex-A53 loads
 
 BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
 
@@ -53,9 +53,9 @@
         CSEL    x17, x16, x17, LS       //   c2 = c1
 
         CMP     x0, 4                   // if mr < 4
-        STP     x20, x21, [sp, -32]!    // Save x20-x23 on stack
-        STP     x22, x23, [sp, 16]
+        STP     x20, x21, [sp, -32]!    // Save x20-x22 on stack
         ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        STR     x22, [sp, 16]
         CSEL    x7,  x17, x7, LO        //   c3 = c2
 
         .p2align 3
@@ -107,7 +107,7 @@
         LDR     d2, [x15], 8
         LDR     d3, [x20], 8
         SXTL    v0.8h, v0.8b
-        LDR     x23, [x5, 16]
+        LDR     x8, [x5, 16]
         SXTL    v4.8h, v4.8b
         SXTL    v1.8h, v1.8b
         SXTL    v2.8h, v2.8b
@@ -130,7 +130,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[0]
         SMLAL2  v23.4s, v4.8h, v3.h[0]
         LDR     d4, [x5, 24]
-        INS     v5.d[0], x23
+        INS     v5.d[0], x8
         SMLAL   v24.4s, v6.4h, v0.h[0]
         SMLAL2  v28.4s, v6.8h, v0.h[0]
         SMLAL   v25.4s, v6.4h, v1.h[0]
@@ -140,7 +140,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[0]
         SMLAL   v27.4s, v6.4h, v3.h[0]
         SMLAL2  v31.4s, v6.8h, v3.h[0]
-        LDR     x23, [x5, 32]
+        LDR     x8, [x5, 32]
         SMLAL   v16.4s, v5.4h, v0.h[1]
         SMLAL2  v20.4s, v5.8h, v0.h[1]
         SMLAL   v17.4s, v5.4h, v1.h[1]
@@ -151,7 +151,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[1]
         SMLAL2  v23.4s, v5.8h, v3.h[1]
         LDR     d5, [x5, 40]
-        INS     v6.d[0], x23
+        INS     v6.d[0], x8
         SMLAL   v24.4s, v4.4h, v0.h[1]
         SMLAL2  v28.4s, v4.8h, v0.h[1]
         SMLAL   v25.4s, v4.4h, v1.h[1]
@@ -161,7 +161,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[1]
         SMLAL   v27.4s, v4.4h, v3.h[1]
         SMLAL2  v31.4s, v4.8h, v3.h[1]
-        LDR     x23, [x5, 48]
+        LDR     x8, [x5, 48]
         SMLAL   v16.4s, v6.4h, v0.h[2]
         SMLAL2  v20.4s, v6.8h, v0.h[2]
         SMLAL   v17.4s, v6.4h, v1.h[2]
@@ -172,7 +172,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[2]
         SMLAL2  v23.4s, v6.8h, v3.h[2]
         LDR     d6, [x5, 56]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SMLAL   v24.4s, v5.4h, v0.h[2]
         SMLAL2  v28.4s, v5.8h, v0.h[2]
         SMLAL   v25.4s, v5.4h, v1.h[2]
@@ -182,7 +182,7 @@
         SMLAL2  v30.4s, v5.8h, v2.h[2]
         SMLAL   v27.4s, v5.4h, v3.h[2]
         SMLAL2  v31.4s, v5.8h, v3.h[2]
-        LDR     x23, [x5, 64]
+        LDR     x8, [x5, 64]
         SMLAL   v16.4s, v4.4h, v0.h[3]
         SMLAL2  v20.4s, v4.8h, v0.h[3]
         SMLAL   v17.4s, v4.4h, v1.h[3]
@@ -193,7 +193,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[3]
         SMLAL2  v23.4s, v4.8h, v3.h[3]
         LDR     d4, [x5, 72]
-        INS     v5.d[0], x23
+        INS     v5.d[0], x8
         SMLAL   v24.4s, v6.4h, v0.h[3]
         SMLAL2  v28.4s, v6.8h, v0.h[3]
         SXTL    v5.8h, v5.8b
@@ -203,7 +203,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[3]
         SMLAL   v27.4s, v6.4h, v3.h[3]
         SMLAL2  v31.4s, v6.8h, v3.h[3]
-        LDR     x23, [x5, 80]
+        LDR     x8, [x5, 80]
         SMLAL   v16.4s, v5.4h, v0.h[4]
         SMLAL2  v20.4s, v5.8h, v0.h[4]
         SMLAL   v17.4s, v5.4h, v1.h[4]
@@ -214,7 +214,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[4]
         SMLAL2  v23.4s, v5.8h, v3.h[4]
         LDR     d5, [x5, 88]
-        INS     v6.d[0], x23
+        INS     v6.d[0], x8
         SMLAL   v24.4s, v4.4h, v0.h[4]
         SMLAL2  v28.4s, v4.8h, v0.h[4]
         SMLAL   v25.4s, v4.4h, v1.h[4]
@@ -224,7 +224,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[4]
         SMLAL   v27.4s, v4.4h, v3.h[4]
         SMLAL2  v31.4s, v4.8h, v3.h[4]
-        LDR     x23, [x5, 96]
+        LDR     x8, [x5, 96]
         SMLAL   v16.4s, v6.4h, v0.h[5]
         SMLAL2  v20.4s, v6.8h, v0.h[5]
         SMLAL   v17.4s, v6.4h, v1.h[5]
@@ -235,7 +235,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[5]
         SMLAL2  v23.4s, v6.8h, v3.h[5]
         LDR     d6, [x5, 104]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SMLAL   v24.4s, v5.4h, v0.h[5]
         SMLAL2  v28.4s, v5.8h, v0.h[5]
         SMLAL   v25.4s, v5.4h, v1.h[5]
@@ -254,13 +254,13 @@
         SMLAL2  v22.4s, v4.8h, v2.h[6]
         SMLAL   v19.4s, v4.4h, v3.h[6]
         SMLAL2  v23.4s, v4.8h, v3.h[6]
-        LDR     x23, [x5, 112]
+        LDR     x8, [x5, 112]
         SMLAL   v24.4s, v6.4h, v0.h[6]
         SMLAL2  v28.4s, v6.8h, v0.h[6]
         SMLAL   v25.4s, v6.4h, v1.h[6]
         SMLAL2  v29.4s, v6.8h, v1.h[6]
         LDR     d5, [x5, 120]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SXTL    v4.8h, v4.8b
         ADD     x5, x5, 128
 
@@ -276,7 +276,7 @@
         SXTL    v5.8h, v5.8b
         SMLAL   v18.4s, v4.4h, v2.h[7]
         SMLAL2  v22.4s, v4.8h, v2.h[7]
-        LDR     x23, [x5]
+        LDR     x8, [x5]
         SMLAL   v19.4s, v4.4h, v3.h[7]
         SMLAL2  v23.4s, v4.8h, v3.h[7]
         LDR     x21, [x13], 8
@@ -296,13 +296,13 @@
         LDR     d3, [x20], 8
         INS     v2.d[0], x22
         LDR     d6, [x5, 8]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SXTL    v0.8h, v0.8b
         SXTL    v1.8h, v1.8b
         SUBS    x0, x0, 8
         SXTL    v4.8h, v4.8b
         SXTL    v2.8h, v2.8b
-        LDR     x23, [x5, 16]
+        LDR     x8, [x5, 16]
         SXTL    v3.8h, v3.8b
         SXTL    v6.8h, v6.8b
         B.HS    2b
@@ -320,7 +320,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[0]
         SMLAL2  v23.4s, v4.8h, v3.h[0]
         LDR     d4, [x5, 24]
-        INS     v5.d[0], x23
+        INS     v5.d[0], x8
         SMLAL   v24.4s, v6.4h, v0.h[0]
         SMLAL2  v28.4s, v6.8h, v0.h[0]
         SMLAL   v25.4s, v6.4h, v1.h[0]
@@ -330,7 +330,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[0]
         SMLAL   v27.4s, v6.4h, v3.h[0]
         SMLAL2  v31.4s, v6.8h, v3.h[0]
-        LDR     x23, [x5, 32]
+        LDR     x8, [x5, 32]
         SMLAL   v16.4s, v5.4h, v0.h[1]
         SMLAL2  v20.4s, v5.8h, v0.h[1]
         SMLAL   v17.4s, v5.4h, v1.h[1]
@@ -341,7 +341,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[1]
         SMLAL2  v23.4s, v5.8h, v3.h[1]
         LDR     d5, [x5, 40]
-        INS     v6.d[0], x23
+        INS     v6.d[0], x8
         SMLAL   v24.4s, v4.4h, v0.h[1]
         SMLAL2  v28.4s, v4.8h, v0.h[1]
         SMLAL   v25.4s, v4.4h, v1.h[1]
@@ -351,7 +351,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[1]
         SMLAL   v27.4s, v4.4h, v3.h[1]
         SMLAL2  v31.4s, v4.8h, v3.h[1]
-        LDR     x23, [x5, 48]
+        LDR     x8, [x5, 48]
         SMLAL   v16.4s, v6.4h, v0.h[2]
         SMLAL2  v20.4s, v6.8h, v0.h[2]
         SMLAL   v17.4s, v6.4h, v1.h[2]
@@ -362,7 +362,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[2]
         SMLAL2  v23.4s, v6.8h, v3.h[2]
         LDR     d6, [x5, 56]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SMLAL   v24.4s, v5.4h, v0.h[2]
         SMLAL2  v28.4s, v5.8h, v0.h[2]
         SMLAL   v25.4s, v5.4h, v1.h[2]
@@ -372,7 +372,7 @@
         SMLAL2  v30.4s, v5.8h, v2.h[2]
         SMLAL   v27.4s, v5.4h, v3.h[2]
         SMLAL2  v31.4s, v5.8h, v3.h[2]
-        LDR     x23, [x5, 64]
+        LDR     x8, [x5, 64]
         SMLAL   v16.4s, v4.4h, v0.h[3]
         SMLAL2  v20.4s, v4.8h, v0.h[3]
         SMLAL   v17.4s, v4.4h, v1.h[3]
@@ -383,7 +383,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[3]
         SMLAL2  v23.4s, v4.8h, v3.h[3]
         LDR     d4, [x5, 72]
-        INS     v5.d[0], x23
+        INS     v5.d[0], x8
         SMLAL   v24.4s, v6.4h, v0.h[3]
         SMLAL2  v28.4s, v6.8h, v0.h[3]
         SXTL    v5.8h, v5.8b
@@ -393,7 +393,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[3]
         SMLAL   v27.4s, v6.4h, v3.h[3]
         SMLAL2  v31.4s, v6.8h, v3.h[3]
-        LDR     x23, [x5, 80]
+        LDR     x8, [x5, 80]
         SMLAL   v16.4s, v5.4h, v0.h[4]
         SMLAL2  v20.4s, v5.8h, v0.h[4]
         SMLAL   v17.4s, v5.4h, v1.h[4]
@@ -404,7 +404,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[4]
         SMLAL2  v23.4s, v5.8h, v3.h[4]
         LDR     d5, [x5, 88]
-        INS     v6.d[0], x23
+        INS     v6.d[0], x8
         SMLAL   v24.4s, v4.4h, v0.h[4]
         SMLAL2  v28.4s, v4.8h, v0.h[4]
         SMLAL   v25.4s, v4.4h, v1.h[4]
@@ -414,7 +414,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[4]
         SMLAL   v27.4s, v4.4h, v3.h[4]
         SMLAL2  v31.4s, v4.8h, v3.h[4]
-        LDR     x23, [x5, 96]
+        LDR     x8, [x5, 96]
         SMLAL   v16.4s, v6.4h, v0.h[5]
         SMLAL2  v20.4s, v6.8h, v0.h[5]
         SMLAL   v17.4s, v6.4h, v1.h[5]
@@ -425,7 +425,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[5]
         SMLAL2  v23.4s, v6.8h, v3.h[5]
         LDR     d6, [x5, 104]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SMLAL   v24.4s, v5.4h, v0.h[5]
         SMLAL2  v28.4s, v5.8h, v0.h[5]
         SMLAL   v25.4s, v5.4h, v1.h[5]
@@ -444,13 +444,13 @@
         SMLAL2  v22.4s, v4.8h, v2.h[6]
         SMLAL   v19.4s, v4.4h, v3.h[6]
         SMLAL2  v23.4s, v4.8h, v3.h[6]
-        LDR     x23, [x5, 112]
+        LDR     x8, [x5, 112]
         SMLAL   v24.4s, v6.4h, v0.h[6]
         SMLAL2  v28.4s, v6.8h, v0.h[6]
         SMLAL   v25.4s, v6.4h, v1.h[6]
         SMLAL2  v29.4s, v6.8h, v1.h[6]
         LDR     d5, [x5, 120]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SXTL    v4.8h, v4.8b
         SMLAL   v26.4s, v6.4h, v2.h[6]
         SMLAL2  v30.4s, v6.8h, v2.h[6]
@@ -470,9 +470,10 @@
         SMLAL2  v28.4s, v5.8h, v0.h[7]
         SMLAL   v25.4s, v5.4h, v1.h[7]
         SMLAL2  v29.4s, v5.8h, v1.h[7]
-        AND     x0, x2, 7              // kc remainder 0 to 7
+        AND     x0, x2, 7               // kc remainder 0 to 7
         SMLAL   v26.4s, v5.4h, v2.h[7]
         SMLAL2  v30.4s, v5.8h, v2.h[7]
+        LDR     x8, [sp, 56]            // reload params pointer
         SMLAL   v27.4s, v5.4h, v3.h[7]
         SMLAL2  v31.4s, v5.8h, v3.h[7]
 
@@ -627,8 +628,8 @@
         # nc loop
         B.HI    0b
 
-        # Restore x20-x23 from stack
-        LDP     x22, x23, [sp, 16]
+        # Restore x20-x22 from stack
+        LDR     x22, [sp, 16]
         LDP     x20, x21, [sp], 32
         RET
 
@@ -834,8 +835,8 @@
         ST1     {v5.b}[0], [x16]
         ST1     {v4.b}[0], [x6]
 9:
-        # Restore x20-x23 from stack
-        LDP     x22, x23, [sp, 16]
+        # Restore x20-x22 from stack
+        LDR     x22, [sp, 16]
         LDP     x20, x21, [sp], 32
         RET
 
diff --git a/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-prfm-cortex-a53.S b/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-prfm-cortex-a53.S
index 65e9a61..a235a7f 100644
--- a/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-prfm-cortex-a53.S
+++ b/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-prfm-cortex-a53.S
@@ -21,7 +21,7 @@
 #     size_t cn_stride,                  [sp] -> x10
 #     size_t a_offset,                   [sp + 8] -> x11
 #     const float* zero,                 [sp + 16] -> x12
-#     const xnn_f32_minmax_params params [sp + 24] -> x8
+#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -37,7 +37,7 @@
 # C3   x7 v19 v23 v27 v31
 # temp    v7
 # unused  v8 v9 v10 v11 v12 v13 v14 v15
-# x21, x22, x23 temp for Cortex-A53 loads
+# x8, x21, x22 temp for Cortex-A53 loads
 
 BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
 
@@ -53,9 +53,9 @@
         CSEL    x17, x16, x17, LS       //   c2 = c1
 
         CMP     x0, 4                   // if mr < 4
-        STP     x20, x21, [sp, -32]!    // Save x20-x23 on stack
-        STP     x22, x23, [sp, 16]
+        STP     x20, x21, [sp, -32]!    // Save x20-x22 on stack
         ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        STR     x22, [sp, 16]
         CSEL    x7,  x17, x7, LO        //   c3 = c2
 
         .p2align 3
@@ -107,7 +107,7 @@
         LDR     d2, [x15], 8
         LDR     d3, [x20], 8
         SXTL    v0.8h, v0.8b
-        LDR     x23, [x5, 16]
+        LDR     x8, [x5, 16]
         SXTL    v4.8h, v4.8b
         SXTL    v1.8h, v1.8b
         SXTL    v2.8h, v2.8b
@@ -134,7 +134,7 @@
         SMLAL2  v23.4s, v4.8h, v3.h[0]
         PRFM    PLDL1KEEP, [x20, 128]
         LDR     d4, [x5, 24]
-        INS     v5.d[0], x23
+        INS     v5.d[0], x8
         SMLAL   v24.4s, v6.4h, v0.h[0]
         SMLAL2  v28.4s, v6.8h, v0.h[0]
         PRFM    PLDL1KEEP, [x5, 448]
@@ -146,7 +146,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[0]
         SMLAL   v27.4s, v6.4h, v3.h[0]
         SMLAL2  v31.4s, v6.8h, v3.h[0]
-        LDR     x23, [x5, 32]
+        LDR     x8, [x5, 32]
         SMLAL   v16.4s, v5.4h, v0.h[1]
         SMLAL2  v20.4s, v5.8h, v0.h[1]
         SMLAL   v17.4s, v5.4h, v1.h[1]
@@ -157,7 +157,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[1]
         SMLAL2  v23.4s, v5.8h, v3.h[1]
         LDR     d5, [x5, 40]
-        INS     v6.d[0], x23
+        INS     v6.d[0], x8
         SMLAL   v24.4s, v4.4h, v0.h[1]
         SMLAL2  v28.4s, v4.8h, v0.h[1]
         SMLAL   v25.4s, v4.4h, v1.h[1]
@@ -167,7 +167,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[1]
         SMLAL   v27.4s, v4.4h, v3.h[1]
         SMLAL2  v31.4s, v4.8h, v3.h[1]
-        LDR     x23, [x5, 48]
+        LDR     x8, [x5, 48]
         SMLAL   v16.4s, v6.4h, v0.h[2]
         SMLAL2  v20.4s, v6.8h, v0.h[2]
         SMLAL   v17.4s, v6.4h, v1.h[2]
@@ -178,7 +178,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[2]
         SMLAL2  v23.4s, v6.8h, v3.h[2]
         LDR     d6, [x5, 56]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SMLAL   v24.4s, v5.4h, v0.h[2]
         SMLAL2  v28.4s, v5.8h, v0.h[2]
         SMLAL   v25.4s, v5.4h, v1.h[2]
@@ -188,7 +188,7 @@
         SMLAL2  v30.4s, v5.8h, v2.h[2]
         SMLAL   v27.4s, v5.4h, v3.h[2]
         SMLAL2  v31.4s, v5.8h, v3.h[2]
-        LDR     x23, [x5, 64]
+        LDR     x8, [x5, 64]
         SMLAL   v16.4s, v4.4h, v0.h[3]
         SMLAL2  v20.4s, v4.8h, v0.h[3]
         SMLAL   v17.4s, v4.4h, v1.h[3]
@@ -199,7 +199,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[3]
         SMLAL2  v23.4s, v4.8h, v3.h[3]
         LDR     d4, [x5, 72]
-        INS     v5.d[0], x23
+        INS     v5.d[0], x8
         SMLAL   v24.4s, v6.4h, v0.h[3]
         SMLAL2  v28.4s, v6.8h, v0.h[3]
         SXTL    v5.8h, v5.8b
@@ -209,7 +209,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[3]
         SMLAL   v27.4s, v6.4h, v3.h[3]
         SMLAL2  v31.4s, v6.8h, v3.h[3]
-        LDR     x23, [x5, 80]
+        LDR     x8, [x5, 80]
         SMLAL   v16.4s, v5.4h, v0.h[4]
         SMLAL2  v20.4s, v5.8h, v0.h[4]
         SMLAL   v17.4s, v5.4h, v1.h[4]
@@ -220,7 +220,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[4]
         SMLAL2  v23.4s, v5.8h, v3.h[4]
         LDR     d5, [x5, 88]
-        INS     v6.d[0], x23
+        INS     v6.d[0], x8
         SMLAL   v24.4s, v4.4h, v0.h[4]
         SMLAL2  v28.4s, v4.8h, v0.h[4]
         SMLAL   v25.4s, v4.4h, v1.h[4]
@@ -230,7 +230,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[4]
         SMLAL   v27.4s, v4.4h, v3.h[4]
         SMLAL2  v31.4s, v4.8h, v3.h[4]
-        LDR     x23, [x5, 96]
+        LDR     x8, [x5, 96]
         SMLAL   v16.4s, v6.4h, v0.h[5]
         SMLAL2  v20.4s, v6.8h, v0.h[5]
         SMLAL   v17.4s, v6.4h, v1.h[5]
@@ -241,7 +241,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[5]
         SMLAL2  v23.4s, v6.8h, v3.h[5]
         LDR     d6, [x5, 104]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SMLAL   v24.4s, v5.4h, v0.h[5]
         SMLAL2  v28.4s, v5.8h, v0.h[5]
         SMLAL   v25.4s, v5.4h, v1.h[5]
@@ -260,13 +260,13 @@
         SMLAL2  v22.4s, v4.8h, v2.h[6]
         SMLAL   v19.4s, v4.4h, v3.h[6]
         SMLAL2  v23.4s, v4.8h, v3.h[6]
-        LDR     x23, [x5, 112]
+        LDR     x8, [x5, 112]
         SMLAL   v24.4s, v6.4h, v0.h[6]
         SMLAL2  v28.4s, v6.8h, v0.h[6]
         SMLAL   v25.4s, v6.4h, v1.h[6]
         SMLAL2  v29.4s, v6.8h, v1.h[6]
         LDR     d5, [x5, 120]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SXTL    v4.8h, v4.8b
         ADD     x5, x5, 128
 
@@ -282,7 +282,7 @@
         SXTL    v5.8h, v5.8b
         SMLAL   v18.4s, v4.4h, v2.h[7]
         SMLAL2  v22.4s, v4.8h, v2.h[7]
-        LDR     x23, [x5]
+        LDR     x8, [x5]
         SMLAL   v19.4s, v4.4h, v3.h[7]
         SMLAL2  v23.4s, v4.8h, v3.h[7]
         LDR     x21, [x13], 8
@@ -302,13 +302,13 @@
         LDR     d3, [x20], 8
         INS     v2.d[0], x22
         LDR     d6, [x5, 8]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SXTL    v0.8h, v0.8b
         SXTL    v1.8h, v1.8b
         SUBS    x0, x0, 8
         SXTL    v4.8h, v4.8b
         SXTL    v2.8h, v2.8b
-        LDR     x23, [x5, 16]
+        LDR     x8, [x5, 16]
         SXTL    v3.8h, v3.8b
         SXTL    v6.8h, v6.8b
         B.HS    2b
@@ -326,7 +326,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[0]
         SMLAL2  v23.4s, v4.8h, v3.h[0]
         LDR     d4, [x5, 24]
-        INS     v5.d[0], x23
+        INS     v5.d[0], x8
         SMLAL   v24.4s, v6.4h, v0.h[0]
         SMLAL2  v28.4s, v6.8h, v0.h[0]
         SMLAL   v25.4s, v6.4h, v1.h[0]
@@ -336,7 +336,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[0]
         SMLAL   v27.4s, v6.4h, v3.h[0]
         SMLAL2  v31.4s, v6.8h, v3.h[0]
-        LDR     x23, [x5, 32]
+        LDR     x8, [x5, 32]
         SMLAL   v16.4s, v5.4h, v0.h[1]
         SMLAL2  v20.4s, v5.8h, v0.h[1]
         SMLAL   v17.4s, v5.4h, v1.h[1]
@@ -347,7 +347,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[1]
         SMLAL2  v23.4s, v5.8h, v3.h[1]
         LDR     d5, [x5, 40]
-        INS     v6.d[0], x23
+        INS     v6.d[0], x8
         SMLAL   v24.4s, v4.4h, v0.h[1]
         SMLAL2  v28.4s, v4.8h, v0.h[1]
         SMLAL   v25.4s, v4.4h, v1.h[1]
@@ -357,7 +357,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[1]
         SMLAL   v27.4s, v4.4h, v3.h[1]
         SMLAL2  v31.4s, v4.8h, v3.h[1]
-        LDR     x23, [x5, 48]
+        LDR     x8, [x5, 48]
         SMLAL   v16.4s, v6.4h, v0.h[2]
         SMLAL2  v20.4s, v6.8h, v0.h[2]
         SMLAL   v17.4s, v6.4h, v1.h[2]
@@ -368,7 +368,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[2]
         SMLAL2  v23.4s, v6.8h, v3.h[2]
         LDR     d6, [x5, 56]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SMLAL   v24.4s, v5.4h, v0.h[2]
         SMLAL2  v28.4s, v5.8h, v0.h[2]
         SMLAL   v25.4s, v5.4h, v1.h[2]
@@ -378,7 +378,7 @@
         SMLAL2  v30.4s, v5.8h, v2.h[2]
         SMLAL   v27.4s, v5.4h, v3.h[2]
         SMLAL2  v31.4s, v5.8h, v3.h[2]
-        LDR     x23, [x5, 64]
+        LDR     x8, [x5, 64]
         SMLAL   v16.4s, v4.4h, v0.h[3]
         SMLAL2  v20.4s, v4.8h, v0.h[3]
         SMLAL   v17.4s, v4.4h, v1.h[3]
@@ -389,7 +389,7 @@
         SMLAL   v19.4s, v4.4h, v3.h[3]
         SMLAL2  v23.4s, v4.8h, v3.h[3]
         LDR     d4, [x5, 72]
-        INS     v5.d[0], x23
+        INS     v5.d[0], x8
         SMLAL   v24.4s, v6.4h, v0.h[3]
         SMLAL2  v28.4s, v6.8h, v0.h[3]
         SXTL    v5.8h, v5.8b
@@ -399,7 +399,7 @@
         SMLAL2  v30.4s, v6.8h, v2.h[3]
         SMLAL   v27.4s, v6.4h, v3.h[3]
         SMLAL2  v31.4s, v6.8h, v3.h[3]
-        LDR     x23, [x5, 80]
+        LDR     x8, [x5, 80]
         SMLAL   v16.4s, v5.4h, v0.h[4]
         SMLAL2  v20.4s, v5.8h, v0.h[4]
         SMLAL   v17.4s, v5.4h, v1.h[4]
@@ -410,7 +410,7 @@
         SMLAL   v19.4s, v5.4h, v3.h[4]
         SMLAL2  v23.4s, v5.8h, v3.h[4]
         LDR     d5, [x5, 88]
-        INS     v6.d[0], x23
+        INS     v6.d[0], x8
         SMLAL   v24.4s, v4.4h, v0.h[4]
         SMLAL2  v28.4s, v4.8h, v0.h[4]
         SMLAL   v25.4s, v4.4h, v1.h[4]
@@ -420,7 +420,7 @@
         SMLAL2  v30.4s, v4.8h, v2.h[4]
         SMLAL   v27.4s, v4.4h, v3.h[4]
         SMLAL2  v31.4s, v4.8h, v3.h[4]
-        LDR     x23, [x5, 96]
+        LDR     x8, [x5, 96]
         SMLAL   v16.4s, v6.4h, v0.h[5]
         SMLAL2  v20.4s, v6.8h, v0.h[5]
         SMLAL   v17.4s, v6.4h, v1.h[5]
@@ -431,7 +431,7 @@
         SMLAL   v19.4s, v6.4h, v3.h[5]
         SMLAL2  v23.4s, v6.8h, v3.h[5]
         LDR     d6, [x5, 104]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SMLAL   v24.4s, v5.4h, v0.h[5]
         SMLAL2  v28.4s, v5.8h, v0.h[5]
         SMLAL   v25.4s, v5.4h, v1.h[5]
@@ -450,13 +450,13 @@
         SMLAL2  v22.4s, v4.8h, v2.h[6]
         SMLAL   v19.4s, v4.4h, v3.h[6]
         SMLAL2  v23.4s, v4.8h, v3.h[6]
-        LDR     x23, [x5, 112]
+        LDR     x8, [x5, 112]
         SMLAL   v24.4s, v6.4h, v0.h[6]
         SMLAL2  v28.4s, v6.8h, v0.h[6]
         SMLAL   v25.4s, v6.4h, v1.h[6]
         SMLAL2  v29.4s, v6.8h, v1.h[6]
         LDR     d5, [x5, 120]
-        INS     v4.d[0], x23
+        INS     v4.d[0], x8
         SXTL    v4.8h, v4.8b
         SMLAL   v26.4s, v6.4h, v2.h[6]
         SMLAL2  v30.4s, v6.8h, v2.h[6]
@@ -476,9 +476,10 @@
         SMLAL2  v28.4s, v5.8h, v0.h[7]
         SMLAL   v25.4s, v5.4h, v1.h[7]
         SMLAL2  v29.4s, v5.8h, v1.h[7]
-        AND     x0, x2, 7              // kc remainder 0 to 7
+        AND     x0, x2, 7               // kc remainder 0 to 7
         SMLAL   v26.4s, v5.4h, v2.h[7]
         SMLAL2  v30.4s, v5.8h, v2.h[7]
+        LDR     x8, [sp, 56]            // reload params pointer
         SMLAL   v27.4s, v5.4h, v3.h[7]
         SMLAL2  v31.4s, v5.8h, v3.h[7]
 
@@ -633,8 +634,8 @@
         # nc loop
         B.HI    0b
 
-        # Restore x20-x23 from stack
-        LDP     x22, x23, [sp, 16]
+        # Restore x20-x22 from stack
+        LDR     x22, [sp, 16]
         LDP     x20, x21, [sp], 32
         RET
 
@@ -840,8 +841,8 @@
         ST1     {v5.b}[0], [x16]
         ST1     {v4.b}[0], [x6]
 9:
-        # Restore x20-x23 from stack
-        LDP     x22, x23, [sp, 16]
+        # Restore x20-x22 from stack
+        LDR     x22, [sp, 16]
         LDP     x20, x21, [sp], 32
         RET