arm64: itx: Minor optimizations for the 8x32 functions
This gives a speedup of a couple of cycles.
diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S
index 0c91379..4be4c8d 100644
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -2317,6 +2317,7 @@
mov w8, #2*\h
1:
+ ldrh w12, [x13], #2
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
ld1 {\i}, [x2]
st1 {v0.8h}, [x2], x8
@@ -2329,14 +2330,13 @@
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+ cmp w3, w12
.if \w == 8
load_add_store_8x8 x0, x7, shiftbits=2
.else
load_add_store_8x8 x0, x7, shiftbits=3
.endif
- ldrh w12, [x13], #2
- cmp w3, w12
b.lt 9f
.if \w == 8
sub x2, x2, x8, lsl #3
@@ -2509,16 +2509,15 @@
mov x8, #2*32
mov w9, #32
mov x6, sp
- mov x7, x2
1:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- ld1 {v\i\().8h}, [x7]
- st1 {v28.8h}, [x7], x8
+ ld1 {v\i\().8h}, [x2]
+ st1 {v28.8h}, [x2], x8
.endr
ldrh w12, [x13], #2
+ sub x2, x2, x8, lsl #3
sub w9, w9, #8
- sub x7, x7, x8, lsl #3
- add x7, x7, #2*8
+ add x2, x2, #2*8
bl inv_dct_8x8_neon
@@ -2528,10 +2527,9 @@
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
cmp w3, w12
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- st1 {v\i\().8h}, [x6], #16
-.endr
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
b.ge 1b
cbz w9, 3f