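4x16 QS8 IGEMM microkernels use x8 as the temp register for Cortex-A53/A55 loads, freeing a callee-saved register (x23 on A53, x21 on A55); the params pointer is reloaded into x8 from the stack in the epilogue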
PiperOrigin-RevId: 375180818
diff --git a/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in b/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
index fd79cbd..fe12a92 100644
--- a/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
+++ b/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
@@ -17,7 +17,7 @@
# size_t cn_stride, [sp] -> x10
# size_t a_offset, [sp + 8] -> x11
# const float* zero, [sp + 16] -> x12
-# const xnn_f32_minmax_params params [sp + 24] -> x8
+# const xnn_f32_minmax_params params [sp + 24] -> (x8)
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -33,7 +33,7 @@
# C3 x7 v19 v23 v27 v31
# temp v7
# unused v8 v9 v10 v11 v12 v13 v14 v15
-# x21, x22, x23 temp for Cortex-A53 loads
+# x8, x21, x22 temp for Cortex-A53 loads
BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a53
@@ -49,9 +49,9 @@
CSEL x17, x16, x17, LS // c2 = c1
CMP x0, 4 // if mr < 4
- STP x20, x21, [sp, -32]! // Save x20-x23 on stack
- STP x22, x23, [sp, 16]
+ STP x20, x21, [sp, -32]! // Save x20-x22 on stack
ADD x7, x17, x7 // c3 = c2 + cm_stride
+ STR x22, [sp, 16]
CSEL x7, x17, x7, LO // c3 = c2
.p2align 3
@@ -103,7 +103,7 @@
LDR d2, [x15], 8
LDR d3, [x20], 8
SXTL v0.8h, v0.8b
- LDR x23, [x5, 16]
+ LDR x8, [x5, 16]
SXTL v4.8h, v4.8b
SXTL v1.8h, v1.8b
SXTL v2.8h, v2.8b
@@ -134,7 +134,7 @@
$if PREFETCH:
PRFM PLDL1KEEP, [x20, 128]
LDR d4, [x5, 24]
- INS v5.d[0], x23
+ INS v5.d[0], x8
SMLAL v24.4s, v6.4h, v0.h[0]
SMLAL2 v28.4s, v6.8h, v0.h[0]
$if PREFETCH:
@@ -148,7 +148,7 @@
SMLAL2 v30.4s, v6.8h, v2.h[0]
SMLAL v27.4s, v6.4h, v3.h[0]
SMLAL2 v31.4s, v6.8h, v3.h[0]
- LDR x23, [x5, 32]
+ LDR x8, [x5, 32]
SMLAL v16.4s, v5.4h, v0.h[1]
SMLAL2 v20.4s, v5.8h, v0.h[1]
SMLAL v17.4s, v5.4h, v1.h[1]
@@ -159,7 +159,7 @@
SMLAL v19.4s, v5.4h, v3.h[1]
SMLAL2 v23.4s, v5.8h, v3.h[1]
LDR d5, [x5, 40]
- INS v6.d[0], x23
+ INS v6.d[0], x8
SMLAL v24.4s, v4.4h, v0.h[1]
SMLAL2 v28.4s, v4.8h, v0.h[1]
SMLAL v25.4s, v4.4h, v1.h[1]
@@ -169,7 +169,7 @@
SMLAL2 v30.4s, v4.8h, v2.h[1]
SMLAL v27.4s, v4.4h, v3.h[1]
SMLAL2 v31.4s, v4.8h, v3.h[1]
- LDR x23, [x5, 48]
+ LDR x8, [x5, 48]
SMLAL v16.4s, v6.4h, v0.h[2]
SMLAL2 v20.4s, v6.8h, v0.h[2]
SMLAL v17.4s, v6.4h, v1.h[2]
@@ -180,7 +180,7 @@
SMLAL v19.4s, v6.4h, v3.h[2]
SMLAL2 v23.4s, v6.8h, v3.h[2]
LDR d6, [x5, 56]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SMLAL v24.4s, v5.4h, v0.h[2]
SMLAL2 v28.4s, v5.8h, v0.h[2]
SMLAL v25.4s, v5.4h, v1.h[2]
@@ -190,7 +190,7 @@
SMLAL2 v30.4s, v5.8h, v2.h[2]
SMLAL v27.4s, v5.4h, v3.h[2]
SMLAL2 v31.4s, v5.8h, v3.h[2]
- LDR x23, [x5, 64]
+ LDR x8, [x5, 64]
SMLAL v16.4s, v4.4h, v0.h[3]
SMLAL2 v20.4s, v4.8h, v0.h[3]
SMLAL v17.4s, v4.4h, v1.h[3]
@@ -201,7 +201,7 @@
SMLAL v19.4s, v4.4h, v3.h[3]
SMLAL2 v23.4s, v4.8h, v3.h[3]
LDR d4, [x5, 72]
- INS v5.d[0], x23
+ INS v5.d[0], x8
SMLAL v24.4s, v6.4h, v0.h[3]
SMLAL2 v28.4s, v6.8h, v0.h[3]
SXTL v5.8h, v5.8b
@@ -211,7 +211,7 @@
SMLAL2 v30.4s, v6.8h, v2.h[3]
SMLAL v27.4s, v6.4h, v3.h[3]
SMLAL2 v31.4s, v6.8h, v3.h[3]
- LDR x23, [x5, 80]
+ LDR x8, [x5, 80]
SMLAL v16.4s, v5.4h, v0.h[4]
SMLAL2 v20.4s, v5.8h, v0.h[4]
SMLAL v17.4s, v5.4h, v1.h[4]
@@ -222,7 +222,7 @@
SMLAL v19.4s, v5.4h, v3.h[4]
SMLAL2 v23.4s, v5.8h, v3.h[4]
LDR d5, [x5, 88]
- INS v6.d[0], x23
+ INS v6.d[0], x8
SMLAL v24.4s, v4.4h, v0.h[4]
SMLAL2 v28.4s, v4.8h, v0.h[4]
SMLAL v25.4s, v4.4h, v1.h[4]
@@ -232,7 +232,7 @@
SMLAL2 v30.4s, v4.8h, v2.h[4]
SMLAL v27.4s, v4.4h, v3.h[4]
SMLAL2 v31.4s, v4.8h, v3.h[4]
- LDR x23, [x5, 96]
+ LDR x8, [x5, 96]
SMLAL v16.4s, v6.4h, v0.h[5]
SMLAL2 v20.4s, v6.8h, v0.h[5]
SMLAL v17.4s, v6.4h, v1.h[5]
@@ -243,7 +243,7 @@
SMLAL v19.4s, v6.4h, v3.h[5]
SMLAL2 v23.4s, v6.8h, v3.h[5]
LDR d6, [x5, 104]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SMLAL v24.4s, v5.4h, v0.h[5]
SMLAL2 v28.4s, v5.8h, v0.h[5]
SMLAL v25.4s, v5.4h, v1.h[5]
@@ -262,13 +262,13 @@
SMLAL2 v22.4s, v4.8h, v2.h[6]
SMLAL v19.4s, v4.4h, v3.h[6]
SMLAL2 v23.4s, v4.8h, v3.h[6]
- LDR x23, [x5, 112]
+ LDR x8, [x5, 112]
SMLAL v24.4s, v6.4h, v0.h[6]
SMLAL2 v28.4s, v6.8h, v0.h[6]
SMLAL v25.4s, v6.4h, v1.h[6]
SMLAL2 v29.4s, v6.8h, v1.h[6]
LDR d5, [x5, 120]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SXTL v4.8h, v4.8b
ADD x5, x5, 128
@@ -284,7 +284,7 @@
SXTL v5.8h, v5.8b
SMLAL v18.4s, v4.4h, v2.h[7]
SMLAL2 v22.4s, v4.8h, v2.h[7]
- LDR x23, [x5]
+ LDR x8, [x5]
SMLAL v19.4s, v4.4h, v3.h[7]
SMLAL2 v23.4s, v4.8h, v3.h[7]
LDR x21, [x13], 8
@@ -304,13 +304,13 @@
LDR d3, [x20], 8
INS v2.d[0], x22
LDR d6, [x5, 8]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SXTL v0.8h, v0.8b
SXTL v1.8h, v1.8b
SUBS x0, x0, 8
SXTL v4.8h, v4.8b
SXTL v2.8h, v2.8b
- LDR x23, [x5, 16]
+ LDR x8, [x5, 16]
SXTL v3.8h, v3.8b
SXTL v6.8h, v6.8b
B.HS 2b
@@ -328,7 +328,7 @@
SMLAL v19.4s, v4.4h, v3.h[0]
SMLAL2 v23.4s, v4.8h, v3.h[0]
LDR d4, [x5, 24]
- INS v5.d[0], x23
+ INS v5.d[0], x8
SMLAL v24.4s, v6.4h, v0.h[0]
SMLAL2 v28.4s, v6.8h, v0.h[0]
SMLAL v25.4s, v6.4h, v1.h[0]
@@ -338,7 +338,7 @@
SMLAL2 v30.4s, v6.8h, v2.h[0]
SMLAL v27.4s, v6.4h, v3.h[0]
SMLAL2 v31.4s, v6.8h, v3.h[0]
- LDR x23, [x5, 32]
+ LDR x8, [x5, 32]
SMLAL v16.4s, v5.4h, v0.h[1]
SMLAL2 v20.4s, v5.8h, v0.h[1]
SMLAL v17.4s, v5.4h, v1.h[1]
@@ -349,7 +349,7 @@
SMLAL v19.4s, v5.4h, v3.h[1]
SMLAL2 v23.4s, v5.8h, v3.h[1]
LDR d5, [x5, 40]
- INS v6.d[0], x23
+ INS v6.d[0], x8
SMLAL v24.4s, v4.4h, v0.h[1]
SMLAL2 v28.4s, v4.8h, v0.h[1]
SMLAL v25.4s, v4.4h, v1.h[1]
@@ -359,7 +359,7 @@
SMLAL2 v30.4s, v4.8h, v2.h[1]
SMLAL v27.4s, v4.4h, v3.h[1]
SMLAL2 v31.4s, v4.8h, v3.h[1]
- LDR x23, [x5, 48]
+ LDR x8, [x5, 48]
SMLAL v16.4s, v6.4h, v0.h[2]
SMLAL2 v20.4s, v6.8h, v0.h[2]
SMLAL v17.4s, v6.4h, v1.h[2]
@@ -370,7 +370,7 @@
SMLAL v19.4s, v6.4h, v3.h[2]
SMLAL2 v23.4s, v6.8h, v3.h[2]
LDR d6, [x5, 56]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SMLAL v24.4s, v5.4h, v0.h[2]
SMLAL2 v28.4s, v5.8h, v0.h[2]
SMLAL v25.4s, v5.4h, v1.h[2]
@@ -380,7 +380,7 @@
SMLAL2 v30.4s, v5.8h, v2.h[2]
SMLAL v27.4s, v5.4h, v3.h[2]
SMLAL2 v31.4s, v5.8h, v3.h[2]
- LDR x23, [x5, 64]
+ LDR x8, [x5, 64]
SMLAL v16.4s, v4.4h, v0.h[3]
SMLAL2 v20.4s, v4.8h, v0.h[3]
SMLAL v17.4s, v4.4h, v1.h[3]
@@ -391,7 +391,7 @@
SMLAL v19.4s, v4.4h, v3.h[3]
SMLAL2 v23.4s, v4.8h, v3.h[3]
LDR d4, [x5, 72]
- INS v5.d[0], x23
+ INS v5.d[0], x8
SMLAL v24.4s, v6.4h, v0.h[3]
SMLAL2 v28.4s, v6.8h, v0.h[3]
SXTL v5.8h, v5.8b
@@ -401,7 +401,7 @@
SMLAL2 v30.4s, v6.8h, v2.h[3]
SMLAL v27.4s, v6.4h, v3.h[3]
SMLAL2 v31.4s, v6.8h, v3.h[3]
- LDR x23, [x5, 80]
+ LDR x8, [x5, 80]
SMLAL v16.4s, v5.4h, v0.h[4]
SMLAL2 v20.4s, v5.8h, v0.h[4]
SMLAL v17.4s, v5.4h, v1.h[4]
@@ -412,7 +412,7 @@
SMLAL v19.4s, v5.4h, v3.h[4]
SMLAL2 v23.4s, v5.8h, v3.h[4]
LDR d5, [x5, 88]
- INS v6.d[0], x23
+ INS v6.d[0], x8
SMLAL v24.4s, v4.4h, v0.h[4]
SMLAL2 v28.4s, v4.8h, v0.h[4]
SMLAL v25.4s, v4.4h, v1.h[4]
@@ -422,7 +422,7 @@
SMLAL2 v30.4s, v4.8h, v2.h[4]
SMLAL v27.4s, v4.4h, v3.h[4]
SMLAL2 v31.4s, v4.8h, v3.h[4]
- LDR x23, [x5, 96]
+ LDR x8, [x5, 96]
SMLAL v16.4s, v6.4h, v0.h[5]
SMLAL2 v20.4s, v6.8h, v0.h[5]
SMLAL v17.4s, v6.4h, v1.h[5]
@@ -433,7 +433,7 @@
SMLAL v19.4s, v6.4h, v3.h[5]
SMLAL2 v23.4s, v6.8h, v3.h[5]
LDR d6, [x5, 104]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SMLAL v24.4s, v5.4h, v0.h[5]
SMLAL2 v28.4s, v5.8h, v0.h[5]
SMLAL v25.4s, v5.4h, v1.h[5]
@@ -452,13 +452,13 @@
SMLAL2 v22.4s, v4.8h, v2.h[6]
SMLAL v19.4s, v4.4h, v3.h[6]
SMLAL2 v23.4s, v4.8h, v3.h[6]
- LDR x23, [x5, 112]
+ LDR x8, [x5, 112]
SMLAL v24.4s, v6.4h, v0.h[6]
SMLAL2 v28.4s, v6.8h, v0.h[6]
SMLAL v25.4s, v6.4h, v1.h[6]
SMLAL2 v29.4s, v6.8h, v1.h[6]
LDR d5, [x5, 120]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SXTL v4.8h, v4.8b
SMLAL v26.4s, v6.4h, v2.h[6]
SMLAL2 v30.4s, v6.8h, v2.h[6]
@@ -478,9 +478,10 @@
SMLAL2 v28.4s, v5.8h, v0.h[7]
SMLAL v25.4s, v5.4h, v1.h[7]
SMLAL2 v29.4s, v5.8h, v1.h[7]
- AND x0, x2, 7 // kc remainder 0 to 7
+ AND x0, x2, 7 // kc remainder 0 to 7
SMLAL v26.4s, v5.4h, v2.h[7]
SMLAL2 v30.4s, v5.8h, v2.h[7]
+ LDR x8, [sp, 56] // reload params pointer
SMLAL v27.4s, v5.4h, v3.h[7]
SMLAL2 v31.4s, v5.8h, v3.h[7]
@@ -635,8 +636,8 @@
# nc loop
B.HI 0b
- # Restore x20-x23 from stack
- LDP x22, x23, [sp, 16]
+ # Restore x20-x22 from stack
+ LDR x22, [sp, 16]
LDP x20, x21, [sp], 32
RET
@@ -842,8 +843,8 @@
ST1 {v5.b}[0], [x16]
ST1 {v4.b}[0], [x6]
9:
- # Restore x20-x23 from stack
- LDP x22, x23, [sp, 16]
+ # Restore x20-x22 from stack
+ LDR x22, [sp, 16]
LDP x20, x21, [sp], 32
RET
diff --git a/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S b/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
index 746e5d0..0774357 100644
--- a/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
+++ b/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S
@@ -17,7 +17,7 @@
# size_t cn_stride, [sp] -> x10
# size_t a_offset, [sp + 8] -> x11
# const float* zero, [sp + 16] -> x12
-# const xnn_f32_minmax_params params [sp + 24] -> x8
+# const xnn_f32_minmax_params params [sp + 24] -> (x8)
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -33,7 +33,7 @@
# C3 x7 v19 v23 v27 v31
# unused v12 v13 v14 v15
-# x21 temp for Cortex-A55 loads
+# x8 temp for Cortex-A55 loads
BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55
@@ -43,7 +43,7 @@
ADD x16, x6, x7 // c1 = c0 + cm_stride
LDP x12, x8, [sp, 16] // Load zero, params pointer
CSEL x16, x6, x16, LO // c1 = c0
- STP x20, x21, [sp, -48]! // Save x20-x21 on stack
+ STR x20, [sp, -48]! // Save x20 on stack
ADD x2, x2, 3 // kc = (kc + 3) & ~3
STP d8, d9, [sp, 16] // Save d8-d11 on stack
@@ -107,7 +107,7 @@
LDR d3, [x20], 8
SUBS x0, x0, 16 // is there 16 for main loop?
LDR d9, [x5], 8
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
# Is there at least 16 bytes for main loop?
B.LO 3f
@@ -123,9 +123,9 @@
SDOT v16.4s, v8.16b, v0.4b[0]
LDR d10, [x5], 8
SDOT v17.4s, v8.16b, v1.4b[0]
- INS v9.d[1], x21
+ INS v9.d[1], x8
SDOT v18.4s, v8.16b, v2.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v19.4s, v8.16b, v3.4b[0]
LDR d4, [x13], 8
@@ -133,9 +133,9 @@
SDOT v20.4s, v9.16b, v0.4b[0]
LDR d11, [x5], 8
SDOT v21.4s, v9.16b, v1.4b[0]
- INS v10.d[1], x21
+ INS v10.d[1], x8
SDOT v22.4s, v9.16b, v2.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v23.4s, v9.16b, v3.4b[0]
LDR d5, [x14], 8
@@ -143,9 +143,9 @@
SDOT v24.4s, v10.16b, v0.4b[0]
LDR d8, [x5], 8
SDOT v25.4s, v10.16b, v1.4b[0]
- INS v11.d[1], x21
+ INS v11.d[1], x8
SDOT v26.4s, v10.16b, v2.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v27.4s, v10.16b, v3.4b[0]
LDR d6, [x15], 8
@@ -153,9 +153,9 @@
SDOT v28.4s, v11.16b, v0.4b[0]
LDR d9, [x5], 8
SDOT v29.4s, v11.16b, v1.4b[0]
- INS v8.d[1], x21
+ INS v8.d[1], x8
SDOT v30.4s, v11.16b, v2.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v31.4s, v11.16b, v3.4b[0]
LDR d7, [x20], 8
@@ -163,45 +163,45 @@
SDOT v16.4s, v8.16b, v0.4b[1]
LDR d10, [x5], 8
SDOT v17.4s, v8.16b, v1.4b[1]
- INS v9.d[1], x21
+ INS v9.d[1], x8
SDOT v18.4s, v8.16b, v2.4b[1]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v19.4s, v8.16b, v3.4b[1]
// BLOCK 1
SDOT v20.4s, v9.16b, v0.4b[1]
LDR d11, [x5], 8
SDOT v21.4s, v9.16b, v1.4b[1]
- INS v10.d[1], x21
+ INS v10.d[1], x8
SDOT v22.4s, v9.16b, v2.4b[1]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v23.4s, v9.16b, v3.4b[1]
// BLOCK 2
SDOT v24.4s, v10.16b, v0.4b[1]
LDR d8, [x5], 8
SDOT v25.4s, v10.16b, v1.4b[1]
- INS v11.d[1], x21
+ INS v11.d[1], x8
SDOT v26.4s, v10.16b, v2.4b[1]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v27.4s, v10.16b, v3.4b[1]
// BLOCK 4
SDOT v28.4s, v11.16b, v0.4b[1]
LDR d9, [x5], 8
SDOT v29.4s, v11.16b, v1.4b[1]
- INS v8.d[1], x21
+ INS v8.d[1], x8
SDOT v30.4s, v11.16b, v2.4b[1]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v31.4s, v11.16b, v3.4b[1]
// BLOCK 0
SDOT v16.4s, v8.16b, v4.4b[0]
LDR d10, [x5], 8
SDOT v17.4s, v8.16b, v5.4b[0]
- INS v9.d[1], x21
+ INS v9.d[1], x8
SDOT v18.4s, v8.16b, v6.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v19.4s, v8.16b, v7.4b[0]
LDR d0, [x13], 8
@@ -209,9 +209,9 @@
SDOT v20.4s, v9.16b, v4.4b[0]
LDR d11, [x5], 8
SDOT v21.4s, v9.16b, v5.4b[0]
- INS v10.d[1], x21
+ INS v10.d[1], x8
SDOT v22.4s, v9.16b, v6.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v23.4s, v9.16b, v7.4b[0]
LDR d1, [x14], 8
@@ -219,9 +219,9 @@
SDOT v24.4s, v10.16b, v4.4b[0]
LDR d8, [x5], 8
SDOT v25.4s, v10.16b, v5.4b[0]
- INS v11.d[1], x21
+ INS v11.d[1], x8
SDOT v26.4s, v10.16b, v6.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v27.4s, v10.16b, v7.4b[0]
LDR d2, [x15], 8
@@ -229,9 +229,9 @@
SDOT v28.4s, v11.16b, v4.4b[0]
LDR d9, [x5], 8
SDOT v29.4s, v11.16b, v5.4b[0]
- INS v8.d[1], x21
+ INS v8.d[1], x8
SDOT v30.4s, v11.16b, v6.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v31.4s, v11.16b, v7.4b[0]
LDR d3, [x20], 8
@@ -239,27 +239,27 @@
SDOT v16.4s, v8.16b, v4.4b[1]
LDR d10, [x5], 8
SDOT v17.4s, v8.16b, v5.4b[1]
- INS v9.d[1], x21
+ INS v9.d[1], x8
SDOT v18.4s, v8.16b, v6.4b[1]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v19.4s, v8.16b, v7.4b[1]
// BLOCK 1
SDOT v20.4s, v9.16b, v4.4b[1]
LDR d11, [x5], 8
SDOT v21.4s, v9.16b, v5.4b[1]
- INS v10.d[1], x21
+ INS v10.d[1], x8
SDOT v22.4s, v9.16b, v6.4b[1]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v23.4s, v9.16b, v7.4b[1]
// BLOCK 2
SDOT v24.4s, v10.16b, v4.4b[1]
LDR d8, [x5], 8 // First B values for block 0 and 1
SDOT v25.4s, v10.16b, v5.4b[1]
- INS v11.d[1], x21
+ INS v11.d[1], x8
SDOT v26.4s, v10.16b, v6.4b[1]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v27.4s, v10.16b, v7.4b[1]
SUBS x0, x0, 16
@@ -267,9 +267,9 @@
SDOT v28.4s, v11.16b, v4.4b[1]
LDR d9, [x5], 8
SDOT v29.4s, v11.16b, v5.4b[1]
- INS v8.d[1], x21
+ INS v8.d[1], x8
SDOT v30.4s, v11.16b, v6.4b[1]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v31.4s, v11.16b, v7.4b[1]
B.HS 2b
@@ -279,9 +279,9 @@
SDOT v16.4s, v8.16b, v0.4b[0]
LDR d10, [x5], 8
SDOT v17.4s, v8.16b, v1.4b[0]
- INS v9.d[1], x21
+ INS v9.d[1], x8
SDOT v18.4s, v8.16b, v2.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v19.4s, v8.16b, v3.4b[0]
LDR d4, [x13], 8
@@ -289,9 +289,9 @@
SDOT v20.4s, v9.16b, v0.4b[0]
LDR d11, [x5], 8
SDOT v21.4s, v9.16b, v1.4b[0]
- INS v10.d[1], x21
+ INS v10.d[1], x8
SDOT v22.4s, v9.16b, v2.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v23.4s, v9.16b, v3.4b[0]
LDR d5, [x14], 8
@@ -299,9 +299,9 @@
SDOT v24.4s, v10.16b, v0.4b[0]
LDR d8, [x5], 8
SDOT v25.4s, v10.16b, v1.4b[0]
- INS v11.d[1], x21
+ INS v11.d[1], x8
SDOT v26.4s, v10.16b, v2.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v27.4s, v10.16b, v3.4b[0]
LDR d6, [x15], 8
@@ -309,9 +309,9 @@
SDOT v28.4s, v11.16b, v0.4b[0]
LDR d9, [x5], 8
SDOT v29.4s, v11.16b, v1.4b[0]
- INS v8.d[1], x21
+ INS v8.d[1], x8
SDOT v30.4s, v11.16b, v2.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v31.4s, v11.16b, v3.4b[0]
LDR d7, [x20], 8
@@ -319,96 +319,96 @@
SDOT v16.4s, v8.16b, v0.4b[1]
LDR d10, [x5], 8
SDOT v17.4s, v8.16b, v1.4b[1]
- INS v9.d[1], x21
+ INS v9.d[1], x8
SDOT v18.4s, v8.16b, v2.4b[1]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v19.4s, v8.16b, v3.4b[1]
// BLOCK 1
SDOT v20.4s, v9.16b, v0.4b[1]
LDR d11, [x5], 8
SDOT v21.4s, v9.16b, v1.4b[1]
- INS v10.d[1], x21
+ INS v10.d[1], x8
SDOT v22.4s, v9.16b, v2.4b[1]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v23.4s, v9.16b, v3.4b[1]
// BLOCK 2
SDOT v24.4s, v10.16b, v0.4b[1]
LDR d8, [x5], 8
SDOT v25.4s, v10.16b, v1.4b[1]
- INS v11.d[1], x21
+ INS v11.d[1], x8
SDOT v26.4s, v10.16b, v2.4b[1]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v27.4s, v10.16b, v3.4b[1]
// BLOCK 4
SDOT v28.4s, v11.16b, v0.4b[1]
LDR d9, [x5], 8
SDOT v29.4s, v11.16b, v1.4b[1]
- INS v8.d[1], x21
+ INS v8.d[1], x8
SDOT v30.4s, v11.16b, v2.4b[1]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v31.4s, v11.16b, v3.4b[1]
// BLOCK 0
SDOT v16.4s, v8.16b, v4.4b[0]
LDR d10, [x5], 8
SDOT v17.4s, v8.16b, v5.4b[0]
- INS v9.d[1], x21
+ INS v9.d[1], x8
SDOT v18.4s, v8.16b, v6.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v19.4s, v8.16b, v7.4b[0]
// BLOCK 1
SDOT v20.4s, v9.16b, v4.4b[0]
LDR d11, [x5], 8
SDOT v21.4s, v9.16b, v5.4b[0]
- INS v10.d[1], x21
+ INS v10.d[1], x8
SDOT v22.4s, v9.16b, v6.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v23.4s, v9.16b, v7.4b[0]
// BLOCK 2
SDOT v24.4s, v10.16b, v4.4b[0]
LDR d8, [x5], 8
SDOT v25.4s, v10.16b, v5.4b[0]
- INS v11.d[1], x21
+ INS v11.d[1], x8
SDOT v26.4s, v10.16b, v6.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v27.4s, v10.16b, v7.4b[0]
// BLOCK 3
SDOT v28.4s, v11.16b, v4.4b[0]
LDR d9, [x5], 8
SDOT v29.4s, v11.16b, v5.4b[0]
- INS v8.d[1], x21
+ INS v8.d[1], x8
SDOT v30.4s, v11.16b, v6.4b[0]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v31.4s, v11.16b, v7.4b[0]
// BLOCK 0
SDOT v16.4s, v8.16b, v4.4b[1]
LDR d10, [x5], 8
SDOT v17.4s, v8.16b, v5.4b[1]
- INS v9.d[1], x21
+ INS v9.d[1], x8
SDOT v18.4s, v8.16b, v6.4b[1]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v19.4s, v8.16b, v7.4b[1]
// BLOCK 1
SDOT v20.4s, v9.16b, v4.4b[1]
LDR d11, [x5], 8
SDOT v21.4s, v9.16b, v5.4b[1]
- INS v10.d[1], x21
+ INS v10.d[1], x8
SDOT v22.4s, v9.16b, v6.4b[1]
- LDR x21, [x5], 8
+ LDR x8, [x5], 8
SDOT v23.4s, v9.16b, v7.4b[1]
// BLOCK 2
SDOT v24.4s, v10.16b, v4.4b[1]
SDOT v25.4s, v10.16b, v5.4b[1]
- INS v11.d[1], x21
+ INS v11.d[1], x8
SDOT v26.4s, v10.16b, v6.4b[1]
SDOT v27.4s, v10.16b, v7.4b[1]
AND x0, x2, 15 // kc remainder 0 to 12
@@ -416,6 +416,7 @@
// BLOCK 3
SDOT v28.4s, v11.16b, v4.4b[1]
SDOT v29.4s, v11.16b, v5.4b[1]
+ LDR x8, [sp, 72] // reload params pointer
SDOT v30.4s, v11.16b, v6.4b[1]
SDOT v31.4s, v11.16b, v7.4b[1]
@@ -570,8 +571,8 @@
LDP d10, d11, [sp, 32]
LDP d8, d9, [sp, 16]
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 48
+ # Restore x20 from stack
+ LDR x20, [sp], 48
RET
# Remainder- 4 to 12 bytes of A
@@ -687,8 +688,8 @@
LDP d10, d11, [sp, 32]
LDP d8, d9, [sp, 16]
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 48
+ # Restore x20 from stack
+ LDR x20, [sp], 48
RET
END_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55
diff --git a/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-cortex-a53.S b/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-cortex-a53.S
index 9084aea..7d502ae 100644
--- a/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-cortex-a53.S
+++ b/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-cortex-a53.S
@@ -21,7 +21,7 @@
# size_t cn_stride, [sp] -> x10
# size_t a_offset, [sp + 8] -> x11
# const float* zero, [sp + 16] -> x12
-# const xnn_f32_minmax_params params [sp + 24] -> x8
+# const xnn_f32_minmax_params params [sp + 24] -> (x8)
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -37,7 +37,7 @@
# C3 x7 v19 v23 v27 v31
# temp v7
# unused v8 v9 v10 v11 v12 v13 v14 v15
-# x21, x22, x23 temp for Cortex-A53 loads
+# x8, x21, x22 temp for Cortex-A53 loads
BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
@@ -53,9 +53,9 @@
CSEL x17, x16, x17, LS // c2 = c1
CMP x0, 4 // if mr < 4
- STP x20, x21, [sp, -32]! // Save x20-x23 on stack
- STP x22, x23, [sp, 16]
+ STP x20, x21, [sp, -32]! // Save x20-x22 on stack
ADD x7, x17, x7 // c3 = c2 + cm_stride
+ STR x22, [sp, 16]
CSEL x7, x17, x7, LO // c3 = c2
.p2align 3
@@ -107,7 +107,7 @@
LDR d2, [x15], 8
LDR d3, [x20], 8
SXTL v0.8h, v0.8b
- LDR x23, [x5, 16]
+ LDR x8, [x5, 16]
SXTL v4.8h, v4.8b
SXTL v1.8h, v1.8b
SXTL v2.8h, v2.8b
@@ -130,7 +130,7 @@
SMLAL v19.4s, v4.4h, v3.h[0]
SMLAL2 v23.4s, v4.8h, v3.h[0]
LDR d4, [x5, 24]
- INS v5.d[0], x23
+ INS v5.d[0], x8
SMLAL v24.4s, v6.4h, v0.h[0]
SMLAL2 v28.4s, v6.8h, v0.h[0]
SMLAL v25.4s, v6.4h, v1.h[0]
@@ -140,7 +140,7 @@
SMLAL2 v30.4s, v6.8h, v2.h[0]
SMLAL v27.4s, v6.4h, v3.h[0]
SMLAL2 v31.4s, v6.8h, v3.h[0]
- LDR x23, [x5, 32]
+ LDR x8, [x5, 32]
SMLAL v16.4s, v5.4h, v0.h[1]
SMLAL2 v20.4s, v5.8h, v0.h[1]
SMLAL v17.4s, v5.4h, v1.h[1]
@@ -151,7 +151,7 @@
SMLAL v19.4s, v5.4h, v3.h[1]
SMLAL2 v23.4s, v5.8h, v3.h[1]
LDR d5, [x5, 40]
- INS v6.d[0], x23
+ INS v6.d[0], x8
SMLAL v24.4s, v4.4h, v0.h[1]
SMLAL2 v28.4s, v4.8h, v0.h[1]
SMLAL v25.4s, v4.4h, v1.h[1]
@@ -161,7 +161,7 @@
SMLAL2 v30.4s, v4.8h, v2.h[1]
SMLAL v27.4s, v4.4h, v3.h[1]
SMLAL2 v31.4s, v4.8h, v3.h[1]
- LDR x23, [x5, 48]
+ LDR x8, [x5, 48]
SMLAL v16.4s, v6.4h, v0.h[2]
SMLAL2 v20.4s, v6.8h, v0.h[2]
SMLAL v17.4s, v6.4h, v1.h[2]
@@ -172,7 +172,7 @@
SMLAL v19.4s, v6.4h, v3.h[2]
SMLAL2 v23.4s, v6.8h, v3.h[2]
LDR d6, [x5, 56]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SMLAL v24.4s, v5.4h, v0.h[2]
SMLAL2 v28.4s, v5.8h, v0.h[2]
SMLAL v25.4s, v5.4h, v1.h[2]
@@ -182,7 +182,7 @@
SMLAL2 v30.4s, v5.8h, v2.h[2]
SMLAL v27.4s, v5.4h, v3.h[2]
SMLAL2 v31.4s, v5.8h, v3.h[2]
- LDR x23, [x5, 64]
+ LDR x8, [x5, 64]
SMLAL v16.4s, v4.4h, v0.h[3]
SMLAL2 v20.4s, v4.8h, v0.h[3]
SMLAL v17.4s, v4.4h, v1.h[3]
@@ -193,7 +193,7 @@
SMLAL v19.4s, v4.4h, v3.h[3]
SMLAL2 v23.4s, v4.8h, v3.h[3]
LDR d4, [x5, 72]
- INS v5.d[0], x23
+ INS v5.d[0], x8
SMLAL v24.4s, v6.4h, v0.h[3]
SMLAL2 v28.4s, v6.8h, v0.h[3]
SXTL v5.8h, v5.8b
@@ -203,7 +203,7 @@
SMLAL2 v30.4s, v6.8h, v2.h[3]
SMLAL v27.4s, v6.4h, v3.h[3]
SMLAL2 v31.4s, v6.8h, v3.h[3]
- LDR x23, [x5, 80]
+ LDR x8, [x5, 80]
SMLAL v16.4s, v5.4h, v0.h[4]
SMLAL2 v20.4s, v5.8h, v0.h[4]
SMLAL v17.4s, v5.4h, v1.h[4]
@@ -214,7 +214,7 @@
SMLAL v19.4s, v5.4h, v3.h[4]
SMLAL2 v23.4s, v5.8h, v3.h[4]
LDR d5, [x5, 88]
- INS v6.d[0], x23
+ INS v6.d[0], x8
SMLAL v24.4s, v4.4h, v0.h[4]
SMLAL2 v28.4s, v4.8h, v0.h[4]
SMLAL v25.4s, v4.4h, v1.h[4]
@@ -224,7 +224,7 @@
SMLAL2 v30.4s, v4.8h, v2.h[4]
SMLAL v27.4s, v4.4h, v3.h[4]
SMLAL2 v31.4s, v4.8h, v3.h[4]
- LDR x23, [x5, 96]
+ LDR x8, [x5, 96]
SMLAL v16.4s, v6.4h, v0.h[5]
SMLAL2 v20.4s, v6.8h, v0.h[5]
SMLAL v17.4s, v6.4h, v1.h[5]
@@ -235,7 +235,7 @@
SMLAL v19.4s, v6.4h, v3.h[5]
SMLAL2 v23.4s, v6.8h, v3.h[5]
LDR d6, [x5, 104]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SMLAL v24.4s, v5.4h, v0.h[5]
SMLAL2 v28.4s, v5.8h, v0.h[5]
SMLAL v25.4s, v5.4h, v1.h[5]
@@ -254,13 +254,13 @@
SMLAL2 v22.4s, v4.8h, v2.h[6]
SMLAL v19.4s, v4.4h, v3.h[6]
SMLAL2 v23.4s, v4.8h, v3.h[6]
- LDR x23, [x5, 112]
+ LDR x8, [x5, 112]
SMLAL v24.4s, v6.4h, v0.h[6]
SMLAL2 v28.4s, v6.8h, v0.h[6]
SMLAL v25.4s, v6.4h, v1.h[6]
SMLAL2 v29.4s, v6.8h, v1.h[6]
LDR d5, [x5, 120]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SXTL v4.8h, v4.8b
ADD x5, x5, 128
@@ -276,7 +276,7 @@
SXTL v5.8h, v5.8b
SMLAL v18.4s, v4.4h, v2.h[7]
SMLAL2 v22.4s, v4.8h, v2.h[7]
- LDR x23, [x5]
+ LDR x8, [x5]
SMLAL v19.4s, v4.4h, v3.h[7]
SMLAL2 v23.4s, v4.8h, v3.h[7]
LDR x21, [x13], 8
@@ -296,13 +296,13 @@
LDR d3, [x20], 8
INS v2.d[0], x22
LDR d6, [x5, 8]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SXTL v0.8h, v0.8b
SXTL v1.8h, v1.8b
SUBS x0, x0, 8
SXTL v4.8h, v4.8b
SXTL v2.8h, v2.8b
- LDR x23, [x5, 16]
+ LDR x8, [x5, 16]
SXTL v3.8h, v3.8b
SXTL v6.8h, v6.8b
B.HS 2b
@@ -320,7 +320,7 @@
SMLAL v19.4s, v4.4h, v3.h[0]
SMLAL2 v23.4s, v4.8h, v3.h[0]
LDR d4, [x5, 24]
- INS v5.d[0], x23
+ INS v5.d[0], x8
SMLAL v24.4s, v6.4h, v0.h[0]
SMLAL2 v28.4s, v6.8h, v0.h[0]
SMLAL v25.4s, v6.4h, v1.h[0]
@@ -330,7 +330,7 @@
SMLAL2 v30.4s, v6.8h, v2.h[0]
SMLAL v27.4s, v6.4h, v3.h[0]
SMLAL2 v31.4s, v6.8h, v3.h[0]
- LDR x23, [x5, 32]
+ LDR x8, [x5, 32]
SMLAL v16.4s, v5.4h, v0.h[1]
SMLAL2 v20.4s, v5.8h, v0.h[1]
SMLAL v17.4s, v5.4h, v1.h[1]
@@ -341,7 +341,7 @@
SMLAL v19.4s, v5.4h, v3.h[1]
SMLAL2 v23.4s, v5.8h, v3.h[1]
LDR d5, [x5, 40]
- INS v6.d[0], x23
+ INS v6.d[0], x8
SMLAL v24.4s, v4.4h, v0.h[1]
SMLAL2 v28.4s, v4.8h, v0.h[1]
SMLAL v25.4s, v4.4h, v1.h[1]
@@ -351,7 +351,7 @@
SMLAL2 v30.4s, v4.8h, v2.h[1]
SMLAL v27.4s, v4.4h, v3.h[1]
SMLAL2 v31.4s, v4.8h, v3.h[1]
- LDR x23, [x5, 48]
+ LDR x8, [x5, 48]
SMLAL v16.4s, v6.4h, v0.h[2]
SMLAL2 v20.4s, v6.8h, v0.h[2]
SMLAL v17.4s, v6.4h, v1.h[2]
@@ -362,7 +362,7 @@
SMLAL v19.4s, v6.4h, v3.h[2]
SMLAL2 v23.4s, v6.8h, v3.h[2]
LDR d6, [x5, 56]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SMLAL v24.4s, v5.4h, v0.h[2]
SMLAL2 v28.4s, v5.8h, v0.h[2]
SMLAL v25.4s, v5.4h, v1.h[2]
@@ -372,7 +372,7 @@
SMLAL2 v30.4s, v5.8h, v2.h[2]
SMLAL v27.4s, v5.4h, v3.h[2]
SMLAL2 v31.4s, v5.8h, v3.h[2]
- LDR x23, [x5, 64]
+ LDR x8, [x5, 64]
SMLAL v16.4s, v4.4h, v0.h[3]
SMLAL2 v20.4s, v4.8h, v0.h[3]
SMLAL v17.4s, v4.4h, v1.h[3]
@@ -383,7 +383,7 @@
SMLAL v19.4s, v4.4h, v3.h[3]
SMLAL2 v23.4s, v4.8h, v3.h[3]
LDR d4, [x5, 72]
- INS v5.d[0], x23
+ INS v5.d[0], x8
SMLAL v24.4s, v6.4h, v0.h[3]
SMLAL2 v28.4s, v6.8h, v0.h[3]
SXTL v5.8h, v5.8b
@@ -393,7 +393,7 @@
SMLAL2 v30.4s, v6.8h, v2.h[3]
SMLAL v27.4s, v6.4h, v3.h[3]
SMLAL2 v31.4s, v6.8h, v3.h[3]
- LDR x23, [x5, 80]
+ LDR x8, [x5, 80]
SMLAL v16.4s, v5.4h, v0.h[4]
SMLAL2 v20.4s, v5.8h, v0.h[4]
SMLAL v17.4s, v5.4h, v1.h[4]
@@ -404,7 +404,7 @@
SMLAL v19.4s, v5.4h, v3.h[4]
SMLAL2 v23.4s, v5.8h, v3.h[4]
LDR d5, [x5, 88]
- INS v6.d[0], x23
+ INS v6.d[0], x8
SMLAL v24.4s, v4.4h, v0.h[4]
SMLAL2 v28.4s, v4.8h, v0.h[4]
SMLAL v25.4s, v4.4h, v1.h[4]
@@ -414,7 +414,7 @@
SMLAL2 v30.4s, v4.8h, v2.h[4]
SMLAL v27.4s, v4.4h, v3.h[4]
SMLAL2 v31.4s, v4.8h, v3.h[4]
- LDR x23, [x5, 96]
+ LDR x8, [x5, 96]
SMLAL v16.4s, v6.4h, v0.h[5]
SMLAL2 v20.4s, v6.8h, v0.h[5]
SMLAL v17.4s, v6.4h, v1.h[5]
@@ -425,7 +425,7 @@
SMLAL v19.4s, v6.4h, v3.h[5]
SMLAL2 v23.4s, v6.8h, v3.h[5]
LDR d6, [x5, 104]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SMLAL v24.4s, v5.4h, v0.h[5]
SMLAL2 v28.4s, v5.8h, v0.h[5]
SMLAL v25.4s, v5.4h, v1.h[5]
@@ -444,13 +444,13 @@
SMLAL2 v22.4s, v4.8h, v2.h[6]
SMLAL v19.4s, v4.4h, v3.h[6]
SMLAL2 v23.4s, v4.8h, v3.h[6]
- LDR x23, [x5, 112]
+ LDR x8, [x5, 112]
SMLAL v24.4s, v6.4h, v0.h[6]
SMLAL2 v28.4s, v6.8h, v0.h[6]
SMLAL v25.4s, v6.4h, v1.h[6]
SMLAL2 v29.4s, v6.8h, v1.h[6]
LDR d5, [x5, 120]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SXTL v4.8h, v4.8b
SMLAL v26.4s, v6.4h, v2.h[6]
SMLAL2 v30.4s, v6.8h, v2.h[6]
@@ -470,9 +470,10 @@
SMLAL2 v28.4s, v5.8h, v0.h[7]
SMLAL v25.4s, v5.4h, v1.h[7]
SMLAL2 v29.4s, v5.8h, v1.h[7]
- AND x0, x2, 7 // kc remainder 0 to 7
+ AND x0, x2, 7 // kc remainder 0 to 7
SMLAL v26.4s, v5.4h, v2.h[7]
SMLAL2 v30.4s, v5.8h, v2.h[7]
+ LDR x8, [sp, 56] // reload params pointer
SMLAL v27.4s, v5.4h, v3.h[7]
SMLAL2 v31.4s, v5.8h, v3.h[7]
@@ -627,8 +628,8 @@
# nc loop
B.HI 0b
- # Restore x20-x23 from stack
- LDP x22, x23, [sp, 16]
+ # Restore x20-x22 from stack
+ LDR x22, [sp, 16]
LDP x20, x21, [sp], 32
RET
@@ -834,8 +835,8 @@
ST1 {v5.b}[0], [x16]
ST1 {v4.b}[0], [x6]
9:
- # Restore x20-x23 from stack
- LDP x22, x23, [sp, 16]
+ # Restore x20-x22 from stack
+ LDR x22, [sp, 16]
LDP x20, x21, [sp], 32
RET
diff --git a/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-prfm-cortex-a53.S b/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-prfm-cortex-a53.S
index 65e9a61..a235a7f 100644
--- a/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-prfm-cortex-a53.S
+++ b/src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-prfm-cortex-a53.S
@@ -21,7 +21,7 @@
# size_t cn_stride, [sp] -> x10
# size_t a_offset, [sp + 8] -> x11
# const float* zero, [sp + 16] -> x12
-# const xnn_f32_minmax_params params [sp + 24] -> x8
+# const xnn_f32_minmax_params params [sp + 24] -> (x8)
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -37,7 +37,7 @@
# C3 x7 v19 v23 v27 v31
# temp v7
# unused v8 v9 v10 v11 v12 v13 v14 v15
-# x21, x22, x23 temp for Cortex-A53 loads
+# x8, x21, x22 temp for Cortex-A53 loads
BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
@@ -53,9 +53,9 @@
CSEL x17, x16, x17, LS // c2 = c1
CMP x0, 4 // if mr < 4
- STP x20, x21, [sp, -32]! // Save x20-x23 on stack
- STP x22, x23, [sp, 16]
+ STP x20, x21, [sp, -32]! // Save x20-x22 on stack
ADD x7, x17, x7 // c3 = c2 + cm_stride
+ STR x22, [sp, 16]
CSEL x7, x17, x7, LO // c3 = c2
.p2align 3
@@ -107,7 +107,7 @@
LDR d2, [x15], 8
LDR d3, [x20], 8
SXTL v0.8h, v0.8b
- LDR x23, [x5, 16]
+ LDR x8, [x5, 16]
SXTL v4.8h, v4.8b
SXTL v1.8h, v1.8b
SXTL v2.8h, v2.8b
@@ -134,7 +134,7 @@
SMLAL2 v23.4s, v4.8h, v3.h[0]
PRFM PLDL1KEEP, [x20, 128]
LDR d4, [x5, 24]
- INS v5.d[0], x23
+ INS v5.d[0], x8
SMLAL v24.4s, v6.4h, v0.h[0]
SMLAL2 v28.4s, v6.8h, v0.h[0]
PRFM PLDL1KEEP, [x5, 448]
@@ -146,7 +146,7 @@
SMLAL2 v30.4s, v6.8h, v2.h[0]
SMLAL v27.4s, v6.4h, v3.h[0]
SMLAL2 v31.4s, v6.8h, v3.h[0]
- LDR x23, [x5, 32]
+ LDR x8, [x5, 32]
SMLAL v16.4s, v5.4h, v0.h[1]
SMLAL2 v20.4s, v5.8h, v0.h[1]
SMLAL v17.4s, v5.4h, v1.h[1]
@@ -157,7 +157,7 @@
SMLAL v19.4s, v5.4h, v3.h[1]
SMLAL2 v23.4s, v5.8h, v3.h[1]
LDR d5, [x5, 40]
- INS v6.d[0], x23
+ INS v6.d[0], x8
SMLAL v24.4s, v4.4h, v0.h[1]
SMLAL2 v28.4s, v4.8h, v0.h[1]
SMLAL v25.4s, v4.4h, v1.h[1]
@@ -167,7 +167,7 @@
SMLAL2 v30.4s, v4.8h, v2.h[1]
SMLAL v27.4s, v4.4h, v3.h[1]
SMLAL2 v31.4s, v4.8h, v3.h[1]
- LDR x23, [x5, 48]
+ LDR x8, [x5, 48]
SMLAL v16.4s, v6.4h, v0.h[2]
SMLAL2 v20.4s, v6.8h, v0.h[2]
SMLAL v17.4s, v6.4h, v1.h[2]
@@ -178,7 +178,7 @@
SMLAL v19.4s, v6.4h, v3.h[2]
SMLAL2 v23.4s, v6.8h, v3.h[2]
LDR d6, [x5, 56]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SMLAL v24.4s, v5.4h, v0.h[2]
SMLAL2 v28.4s, v5.8h, v0.h[2]
SMLAL v25.4s, v5.4h, v1.h[2]
@@ -188,7 +188,7 @@
SMLAL2 v30.4s, v5.8h, v2.h[2]
SMLAL v27.4s, v5.4h, v3.h[2]
SMLAL2 v31.4s, v5.8h, v3.h[2]
- LDR x23, [x5, 64]
+ LDR x8, [x5, 64]
SMLAL v16.4s, v4.4h, v0.h[3]
SMLAL2 v20.4s, v4.8h, v0.h[3]
SMLAL v17.4s, v4.4h, v1.h[3]
@@ -199,7 +199,7 @@
SMLAL v19.4s, v4.4h, v3.h[3]
SMLAL2 v23.4s, v4.8h, v3.h[3]
LDR d4, [x5, 72]
- INS v5.d[0], x23
+ INS v5.d[0], x8
SMLAL v24.4s, v6.4h, v0.h[3]
SMLAL2 v28.4s, v6.8h, v0.h[3]
SXTL v5.8h, v5.8b
@@ -209,7 +209,7 @@
SMLAL2 v30.4s, v6.8h, v2.h[3]
SMLAL v27.4s, v6.4h, v3.h[3]
SMLAL2 v31.4s, v6.8h, v3.h[3]
- LDR x23, [x5, 80]
+ LDR x8, [x5, 80]
SMLAL v16.4s, v5.4h, v0.h[4]
SMLAL2 v20.4s, v5.8h, v0.h[4]
SMLAL v17.4s, v5.4h, v1.h[4]
@@ -220,7 +220,7 @@
SMLAL v19.4s, v5.4h, v3.h[4]
SMLAL2 v23.4s, v5.8h, v3.h[4]
LDR d5, [x5, 88]
- INS v6.d[0], x23
+ INS v6.d[0], x8
SMLAL v24.4s, v4.4h, v0.h[4]
SMLAL2 v28.4s, v4.8h, v0.h[4]
SMLAL v25.4s, v4.4h, v1.h[4]
@@ -230,7 +230,7 @@
SMLAL2 v30.4s, v4.8h, v2.h[4]
SMLAL v27.4s, v4.4h, v3.h[4]
SMLAL2 v31.4s, v4.8h, v3.h[4]
- LDR x23, [x5, 96]
+ LDR x8, [x5, 96]
SMLAL v16.4s, v6.4h, v0.h[5]
SMLAL2 v20.4s, v6.8h, v0.h[5]
SMLAL v17.4s, v6.4h, v1.h[5]
@@ -241,7 +241,7 @@
SMLAL v19.4s, v6.4h, v3.h[5]
SMLAL2 v23.4s, v6.8h, v3.h[5]
LDR d6, [x5, 104]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SMLAL v24.4s, v5.4h, v0.h[5]
SMLAL2 v28.4s, v5.8h, v0.h[5]
SMLAL v25.4s, v5.4h, v1.h[5]
@@ -260,13 +260,13 @@
SMLAL2 v22.4s, v4.8h, v2.h[6]
SMLAL v19.4s, v4.4h, v3.h[6]
SMLAL2 v23.4s, v4.8h, v3.h[6]
- LDR x23, [x5, 112]
+ LDR x8, [x5, 112]
SMLAL v24.4s, v6.4h, v0.h[6]
SMLAL2 v28.4s, v6.8h, v0.h[6]
SMLAL v25.4s, v6.4h, v1.h[6]
SMLAL2 v29.4s, v6.8h, v1.h[6]
LDR d5, [x5, 120]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SXTL v4.8h, v4.8b
ADD x5, x5, 128
@@ -282,7 +282,7 @@
SXTL v5.8h, v5.8b
SMLAL v18.4s, v4.4h, v2.h[7]
SMLAL2 v22.4s, v4.8h, v2.h[7]
- LDR x23, [x5]
+ LDR x8, [x5]
SMLAL v19.4s, v4.4h, v3.h[7]
SMLAL2 v23.4s, v4.8h, v3.h[7]
LDR x21, [x13], 8
@@ -302,13 +302,13 @@
LDR d3, [x20], 8
INS v2.d[0], x22
LDR d6, [x5, 8]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SXTL v0.8h, v0.8b
SXTL v1.8h, v1.8b
SUBS x0, x0, 8
SXTL v4.8h, v4.8b
SXTL v2.8h, v2.8b
- LDR x23, [x5, 16]
+ LDR x8, [x5, 16]
SXTL v3.8h, v3.8b
SXTL v6.8h, v6.8b
B.HS 2b
@@ -326,7 +326,7 @@
SMLAL v19.4s, v4.4h, v3.h[0]
SMLAL2 v23.4s, v4.8h, v3.h[0]
LDR d4, [x5, 24]
- INS v5.d[0], x23
+ INS v5.d[0], x8
SMLAL v24.4s, v6.4h, v0.h[0]
SMLAL2 v28.4s, v6.8h, v0.h[0]
SMLAL v25.4s, v6.4h, v1.h[0]
@@ -336,7 +336,7 @@
SMLAL2 v30.4s, v6.8h, v2.h[0]
SMLAL v27.4s, v6.4h, v3.h[0]
SMLAL2 v31.4s, v6.8h, v3.h[0]
- LDR x23, [x5, 32]
+ LDR x8, [x5, 32]
SMLAL v16.4s, v5.4h, v0.h[1]
SMLAL2 v20.4s, v5.8h, v0.h[1]
SMLAL v17.4s, v5.4h, v1.h[1]
@@ -347,7 +347,7 @@
SMLAL v19.4s, v5.4h, v3.h[1]
SMLAL2 v23.4s, v5.8h, v3.h[1]
LDR d5, [x5, 40]
- INS v6.d[0], x23
+ INS v6.d[0], x8
SMLAL v24.4s, v4.4h, v0.h[1]
SMLAL2 v28.4s, v4.8h, v0.h[1]
SMLAL v25.4s, v4.4h, v1.h[1]
@@ -357,7 +357,7 @@
SMLAL2 v30.4s, v4.8h, v2.h[1]
SMLAL v27.4s, v4.4h, v3.h[1]
SMLAL2 v31.4s, v4.8h, v3.h[1]
- LDR x23, [x5, 48]
+ LDR x8, [x5, 48]
SMLAL v16.4s, v6.4h, v0.h[2]
SMLAL2 v20.4s, v6.8h, v0.h[2]
SMLAL v17.4s, v6.4h, v1.h[2]
@@ -368,7 +368,7 @@
SMLAL v19.4s, v6.4h, v3.h[2]
SMLAL2 v23.4s, v6.8h, v3.h[2]
LDR d6, [x5, 56]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SMLAL v24.4s, v5.4h, v0.h[2]
SMLAL2 v28.4s, v5.8h, v0.h[2]
SMLAL v25.4s, v5.4h, v1.h[2]
@@ -378,7 +378,7 @@
SMLAL2 v30.4s, v5.8h, v2.h[2]
SMLAL v27.4s, v5.4h, v3.h[2]
SMLAL2 v31.4s, v5.8h, v3.h[2]
- LDR x23, [x5, 64]
+ LDR x8, [x5, 64]
SMLAL v16.4s, v4.4h, v0.h[3]
SMLAL2 v20.4s, v4.8h, v0.h[3]
SMLAL v17.4s, v4.4h, v1.h[3]
@@ -389,7 +389,7 @@
SMLAL v19.4s, v4.4h, v3.h[3]
SMLAL2 v23.4s, v4.8h, v3.h[3]
LDR d4, [x5, 72]
- INS v5.d[0], x23
+ INS v5.d[0], x8
SMLAL v24.4s, v6.4h, v0.h[3]
SMLAL2 v28.4s, v6.8h, v0.h[3]
SXTL v5.8h, v5.8b
@@ -399,7 +399,7 @@
SMLAL2 v30.4s, v6.8h, v2.h[3]
SMLAL v27.4s, v6.4h, v3.h[3]
SMLAL2 v31.4s, v6.8h, v3.h[3]
- LDR x23, [x5, 80]
+ LDR x8, [x5, 80]
SMLAL v16.4s, v5.4h, v0.h[4]
SMLAL2 v20.4s, v5.8h, v0.h[4]
SMLAL v17.4s, v5.4h, v1.h[4]
@@ -410,7 +410,7 @@
SMLAL v19.4s, v5.4h, v3.h[4]
SMLAL2 v23.4s, v5.8h, v3.h[4]
LDR d5, [x5, 88]
- INS v6.d[0], x23
+ INS v6.d[0], x8
SMLAL v24.4s, v4.4h, v0.h[4]
SMLAL2 v28.4s, v4.8h, v0.h[4]
SMLAL v25.4s, v4.4h, v1.h[4]
@@ -420,7 +420,7 @@
SMLAL2 v30.4s, v4.8h, v2.h[4]
SMLAL v27.4s, v4.4h, v3.h[4]
SMLAL2 v31.4s, v4.8h, v3.h[4]
- LDR x23, [x5, 96]
+ LDR x8, [x5, 96]
SMLAL v16.4s, v6.4h, v0.h[5]
SMLAL2 v20.4s, v6.8h, v0.h[5]
SMLAL v17.4s, v6.4h, v1.h[5]
@@ -431,7 +431,7 @@
SMLAL v19.4s, v6.4h, v3.h[5]
SMLAL2 v23.4s, v6.8h, v3.h[5]
LDR d6, [x5, 104]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SMLAL v24.4s, v5.4h, v0.h[5]
SMLAL2 v28.4s, v5.8h, v0.h[5]
SMLAL v25.4s, v5.4h, v1.h[5]
@@ -450,13 +450,13 @@
SMLAL2 v22.4s, v4.8h, v2.h[6]
SMLAL v19.4s, v4.4h, v3.h[6]
SMLAL2 v23.4s, v4.8h, v3.h[6]
- LDR x23, [x5, 112]
+ LDR x8, [x5, 112]
SMLAL v24.4s, v6.4h, v0.h[6]
SMLAL2 v28.4s, v6.8h, v0.h[6]
SMLAL v25.4s, v6.4h, v1.h[6]
SMLAL2 v29.4s, v6.8h, v1.h[6]
LDR d5, [x5, 120]
- INS v4.d[0], x23
+ INS v4.d[0], x8
SXTL v4.8h, v4.8b
SMLAL v26.4s, v6.4h, v2.h[6]
SMLAL2 v30.4s, v6.8h, v2.h[6]
@@ -476,9 +476,10 @@
SMLAL2 v28.4s, v5.8h, v0.h[7]
SMLAL v25.4s, v5.4h, v1.h[7]
SMLAL2 v29.4s, v5.8h, v1.h[7]
- AND x0, x2, 7 // kc remainder 0 to 7
+ AND x0, x2, 7 // kc remainder 0 to 7
SMLAL v26.4s, v5.4h, v2.h[7]
SMLAL2 v30.4s, v5.8h, v2.h[7]
+ LDR x8, [sp, 56] // reload params pointer
SMLAL v27.4s, v5.4h, v3.h[7]
SMLAL2 v31.4s, v5.8h, v3.h[7]
@@ -633,8 +634,8 @@
# nc loop
B.HI 0b
- # Restore x20-x23 from stack
- LDP x22, x23, [sp, 16]
+ # Restore x20-x22 from stack
+ LDR x22, [sp, 16]
LDP x20, x21, [sp], 32
RET
@@ -840,8 +841,8 @@
ST1 {v5.b}[0], [x16]
ST1 {v4.b}[0], [x6]
9:
- # Restore x20-x23 from stack
- LDP x22, x23, [sp, 16]
+ # Restore x20-x22 from stack
+ LDR x22, [sp, 16]
LDP x20, x21, [sp], 32
RET