Miscellaneous tweaks to QS8 IGEMM microkernels
- remainder code performs the ks loop check itself
- SUB instructions placed between the SMAX/SMIN instructions
- register pushes interleaved with C pointer clamping

PiperOrigin-RevId: 363253758
diff --git a/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S b/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S
index 68ea7db..5771fc8 100644
--- a/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S
+++ b/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S
@@ -22,12 +22,14 @@
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
 # Register usage
-# A0 x13 v0
-# A1 x15 v1
-# B   x5 v4  v5  v6  v7
-# C0  x7 v2 v10 v12 v14 v16 v18 v20 v22 v24 v26 v28 v30
-# C1  x8 v3 v11 v13 v15 v17 v19 v21 v23 v25 v27 v29 v31
-# unused v8 v9
+# A0 x13  v0
+# A1 x15  v1
+# B   x5  v4  v5  v6  v7
+# C0  x7 v16 v18 v20 v22 v24 v26 v28 v30
+# C1  x8 v17 v19 v21 v23 v25 v27 v29 v31
+# temp0   v2 v10 v12 v14
+# temp1   v3 v11 v13 v15
+# unused  v8 v9
 
 BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal
 
@@ -186,16 +188,15 @@
         SQXTN2  v0.16b, v1.8h
         LD1R    {v1.16b}, [x8], 1
         LD1R    {v2.16b}, [x8]
-        SUB     x8, x8, 11       // rewind params pointer
         SMAX    v0.16b, v0.16b, v1.16b
+        SUB     x8, x8, 11       // rewind params pointer
         SMIN    v0.16b, v0.16b, v2.16b
         B.LO    3f
 
         # Store full 2 x 8
         ST1     {v0.d}[1], [x7], x10
-        ST1     {v0.8b}, [x6], x10
-
         SUB     x4, x4, x3  // a -= ks
+        ST1     {v0.8b}, [x6], x10
 
         # nc loop
         B.HI    0b
diff --git a/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S b/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S
index 5b8f9a3..273021b 100644
--- a/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S
+++ b/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S
@@ -150,11 +150,11 @@
         # Is there a remainder?- 8 bytes of A
         TBNZ    x0, 3, 4f
 
-3:
         # ks loop
         SUBS    x9, x9, 16  // ks -= MR * sizeof(int8_t*)
         B.HI    1b
 
+3:
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
@@ -201,8 +201,8 @@
         SQXTN2  v0.16b, v1.8h
         LD1R    {v1.16b}, [x8], 1
         LD1R    {v2.16b}, [x8]
-        SUB     x8, x8, 11       // rewind params pointer
         SMAX    v0.16b, v0.16b, v1.16b
+        SUB     x8, x8, 11       // rewind params pointer
         SMIN    v0.16b, v0.16b, v2.16b
         B.LO    5f
 
@@ -264,6 +264,10 @@
         SADALP  v29.4s, v13.8h
         SADALP  v30.4s, v14.8h
         SADALP  v31.4s, v15.8h
+
+        # ks loop
+        SUBS    x9, x9, 16  // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
         B       3b
 
         # Store odd width
diff --git a/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S b/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
index a4809e7..cae3fdf 100644
--- a/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
+++ b/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
@@ -37,31 +37,25 @@
 
 BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55
 
-        # Load cn_stride, a_offset
-        LDP     x10, x11, [sp]
-
-        # Load zero, params pointer
-        LDP     x12, x8, [sp, 16]
-
-        # Save x20-x21 on stack
-        STP     x20, x21, [sp, -48]!
-
         # Clamp C pointers
-        CMP      x0, 2              // if mr < 2
-        STP      d8,  d9, [sp, 16]  // Save d8-d11 on stack
-        ADD     x16, x6, x7         // c1 = c0 + cm_stride
-        STP     d10, d11, [sp, 32]
-        CSEL    x16, x6,  x16, LO   //   c1 = c0
-        ADD      x2, x2, 3          // kc = (kc + 3) & ~3
+        CMP      x0, 2                // if mr < 2
+        LDP     x10, x11, [sp]        // Load cn_stride, a_offset
+        ADD     x16, x6, x7           // c1 = c0 + cm_stride
+        LDP     x12, x8, [sp, 16]     // Load zero, params pointer
+        CSEL    x16, x6,  x16, LO     //   c1 = c0
+        STP     x20, x21, [sp, -48]!  // Save x20-x21 on stack
+        ADD      x2, x2, 3            // kc = (kc + 3) & ~3
+        STP      d8,  d9, [sp, 16]    // Save d8-d11 on stack
 
-        ADD     x17, x16, x7        // c2 = c1 + cm_stride
-                                    // if mr <= 2
-        CSEL    x17, x16, x17, LS   //   c2 = c1
+        ADD     x17, x16, x7          // c2 = c1 + cm_stride
+        STP     d10, d11, [sp, 32]
+                                      // if mr <= 2
+        CSEL    x17, x16, x17, LS     //   c2 = c1
         BIC      x2, x2, 3
 
-        CMP      x0, 4              // if mr < 4
-        ADD      x7,  x17, x7       // c3 = c2 + cm_stride
-        CSEL     x7,  x17, x7, LO   //   c3 = c2
+        CMP      x0, 4                // if mr < 4
+        ADD      x7,  x17, x7         // c3 = c2 + cm_stride
+        CSEL     x7,  x17, x7, LO     //   c3 = c2
 
         .p2align 3
 0:
@@ -430,7 +424,6 @@
 
         .p2align 3
 4:
-
         # ks loop
         SUBS    x9, x9, 32  // ks -= MR * sizeof(int8_t*)
         B.HI    1b
diff --git a/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S b/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S
index d6a9fbf..24da63f 100644
--- a/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S
+++ b/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S
@@ -145,11 +145,11 @@
         # Is there a remainder?- 4 bytes of A
         TBNZ    x0, 2, 4f
 
-3:
         # ks loop
         SUBS    x9, x9, 32  // ks -= MR * sizeof(int8_t*)
         B.HI    1b
 
+3:
         # Apply params - scale, shift, bias and clamp
         LD2R    {v0.4s, v1.4s}, [x8], 8
         CMEQ    v2.4s, v1.4s, 0
@@ -323,6 +323,10 @@
         SDOT    v29.4s, v7.16b, v1.4b[0]
         SDOT    v30.4s, v7.16b, v2.4b[0]
         SDOT    v31.4s, v7.16b, v3.4b[0]
+
+        # ks loop
+        SUBS    x9, x9, 32  // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
         B       3b
 
         # Store odd width