Miscellaneous tweaks to QS8 IGEMM microkernels
- remainder code performs its own ks loop check
- SUB placed between the SMAX/SMIN instructions
- register pushes interleaved with C pointer clamping
PiperOrigin-RevId: 363253758
diff --git a/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S b/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S
index 68ea7db..5771fc8 100644
--- a/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S
+++ b/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S
@@ -22,12 +22,14 @@
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# Register usage
-# A0 x13 v0
-# A1 x15 v1
-# B x5 v4 v5 v6 v7
-# C0 x7 v2 v10 v12 v14 v16 v18 v20 v22 v24 v26 v28 v30
-# C1 x8 v3 v11 v13 v15 v17 v19 v21 v23 v25 v27 v29 v31
-# unused v8 v9
+# A0 x13 v0
+# A1 x15 v1
+# B x5 v4 v5 v6 v7
+# C0 x7 v16 v18 v20 v22 v24 v26 v28 v30
+# C1 x8 v17 v19 v21 v23 v25 v27 v29 v31
+# temp0 v2 v10 v12 v14
+# temp1 v3 v11 v13 v15
+# unused v8 v9
BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal
@@ -186,16 +188,15 @@
SQXTN2 v0.16b, v1.8h
LD1R {v1.16b}, [x8], 1
LD1R {v2.16b}, [x8]
- SUB x8, x8, 11 // rewind params pointer
SMAX v0.16b, v0.16b, v1.16b
+ SUB x8, x8, 11 // rewind params pointer
SMIN v0.16b, v0.16b, v2.16b
B.LO 3f
# Store full 2 x 8
ST1 {v0.d}[1], [x7], x10
- ST1 {v0.8b}, [x6], x10
-
SUB x4, x4, x3 // a -= ks
+ ST1 {v0.8b}, [x6], x10
# nc loop
B.HI 0b
diff --git a/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S b/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S
index 5b8f9a3..273021b 100644
--- a/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S
+++ b/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S
@@ -150,11 +150,11 @@
# Is there a remainder?- 8 bytes of A
TBNZ x0, 3, 4f
-3:
# ks loop
SUBS x9, x9, 16 // ks -= MR * sizeof(int8_t*)
B.HI 1b
+3:
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
@@ -201,8 +201,8 @@
SQXTN2 v0.16b, v1.8h
LD1R {v1.16b}, [x8], 1
LD1R {v2.16b}, [x8]
- SUB x8, x8, 11 // rewind params pointer
SMAX v0.16b, v0.16b, v1.16b
+ SUB x8, x8, 11 // rewind params pointer
SMIN v0.16b, v0.16b, v2.16b
B.LO 5f
@@ -264,6 +264,10 @@
SADALP v29.4s, v13.8h
SADALP v30.4s, v14.8h
SADALP v31.4s, v15.8h
+
+ # ks loop
+ SUBS x9, x9, 16 // ks -= MR * sizeof(int8_t*)
+ B.HI 1b
B 3b
# Store odd width
diff --git a/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S b/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
index a4809e7..cae3fdf 100644
--- a/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
+++ b/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
@@ -37,31 +37,25 @@
BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55
- # Load cn_stride, a_offset
- LDP x10, x11, [sp]
-
- # Load zero, params pointer
- LDP x12, x8, [sp, 16]
-
- # Save x20-x21 on stack
- STP x20, x21, [sp, -48]!
-
# Clamp C pointers
- CMP x0, 2 // if mr < 2
- STP d8, d9, [sp, 16] // Save d8-d11 on stack
- ADD x16, x6, x7 // c1 = c0 + cm_stride
- STP d10, d11, [sp, 32]
- CSEL x16, x6, x16, LO // c1 = c0
- ADD x2, x2, 3 // kc = (kc + 3) & ~3
+ CMP x0, 2 // if mr < 2
+ LDP x10, x11, [sp] // Load cn_stride, a_offset
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ LDP x12, x8, [sp, 16] // Load zero, params pointer
+ CSEL x16, x6, x16, LO // c1 = c0
+ STP x20, x21, [sp, -48]! // Save x20-x21 on stack
+ ADD x2, x2, 3 // kc = (kc + 3) & ~3
+ STP d8, d9, [sp, 16] // Save d8-d11 on stack
- ADD x17, x16, x7 // c2 = c1 + cm_stride
- // if mr <= 2
- CSEL x17, x16, x17, LS // c2 = c1
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ STP d10, d11, [sp, 32]
+ // if mr <= 2
+ CSEL x17, x16, x17, LS // c2 = c1
BIC x2, x2, 3
- CMP x0, 4 // if mr < 4
- ADD x7, x17, x7 // c3 = c2 + cm_stride
- CSEL x7, x17, x7, LO // c3 = c2
+ CMP x0, 4 // if mr < 4
+ ADD x7, x17, x7 // c3 = c2 + cm_stride
+ CSEL x7, x17, x7, LO // c3 = c2
.p2align 3
0:
@@ -430,7 +424,6 @@
.p2align 3
4:
-
# ks loop
SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
B.HI 1b
diff --git a/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S b/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S
index d6a9fbf..24da63f 100644
--- a/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S
+++ b/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S
@@ -145,11 +145,11 @@
# Is there a remainder?- 4 bytes of A
TBNZ x0, 2, 4f
-3:
# ks loop
SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
B.HI 1b
+3:
# Apply params - scale, shift, bias and clamp
LD2R {v0.4s, v1.4s}, [x8], 8
CMEQ v2.4s, v1.4s, 0
@@ -323,6 +323,10 @@
SDOT v29.4s, v7.16b, v1.4b[0]
SDOT v30.4s, v7.16b, v2.4b[0]
SDOT v31.4s, v7.16b, v3.4b[0]
+
+ # ks loop
+ SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
+ B.HI 1b
B 3b
# Store odd width