C8 mul microkernel: sort labels and document registers

PiperOrigin-RevId: 362365636
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S b/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S
index 8fb3d70..c472364 100644
--- a/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S
+++ b/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S
@@ -20,12 +20,14 @@
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
 # Register usage
-# A0  x3 v0
-# A1  x4 v1
-# B   x5 v4  v5  v6  v7
-# C0  x7 v2 v10 v12 v14 v16 v18 v20 v22 v24 v26 v28 v30
-# C1  x8 v3 v11 v13 v15 v17 v19 v21 v23 v25 v27 v29 v31
-# unused v8 v9
+# A0  x3  v0
+# A1  x4  v1
+# B   x5  v4  v5  v6  v7
+# C0  x7 v16 v18 v20 v22 v24 v26 v28 v30
+# C1  x8 v17 v19 v21 v23 v25 v27 v29 v31
+# temp0   v2 v10 v12 v14
+# temp1   v3 v11 v13 v15
+# unused  v8 v9
 
 BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal
 
@@ -152,7 +154,7 @@
         LD1R    {v2.16b}, [x9]
         SMAX    v0.16b, v0.16b, v1.16b
         SMIN    v0.16b, v0.16b, v2.16b
-        B.LO    4f
+        B.LO    2f
 
         # Store full 2 x 8
         ST1     {v0.8b}, [x6], x10
@@ -169,22 +171,22 @@
 
         # Store odd width
         .p2align 3
-4:
-        TBZ     x1, 2, 5f
+2:
+        TBZ     x1, 2, 3f
         STR     s0, [x6], 4
         ST1     {v0.s}[2], [x7], 4
         EXT     v0.16b, v0.16b, v0.16b, 4
 
-5:
-        TBZ     x1, 1, 6f
+3:
+        TBZ     x1, 1, 4f
         ST1     {v0.h}[0], [x6], 2
         ST1     {v0.h}[4], [x7], 2
         EXT     v0.16b, v0.16b, v0.16b, 2
-6:
-        TBZ     x1, 0, 7f
+4:
+        TBZ     x1, 0, 5f
         ST1     {v0.b}[0], [x6]
         ST1     {v0.b}[8], [x7]
-7:
+5:
         # Restore d10-d15 from stack
         LDP     d14, d15, [sp, 32]
         LDP     d12, d13, [sp, 16]