// Web-viewer residue (not assembly): blob: 8ddce48529c77e4f7ba9220d579426e283fb008d [file] [log] [blame]
// push_v_regs: spill q8-q15 and x8-x30 into a 320-byte frame below sp.
// Frame layout relative to the new sp (identical to sixteen successive
// pre-decrement stores, so pop_v_regs can unwind it):
//   [sp, #192..#319]  q14/q15, q12/q13, q10/q11, q8/q9
//   [sp, #16..#191]   x28/x29, x26/x27, ... , x8/x9
//   [sp, #0..#15]     x30, x29   (x29 is therefore stored twice)
// sp stays 16-byte aligned; flags are not modified.
.macro push_v_regs
    sub     sp, sp, #320
    stp     q8,  q9,  [sp, #288]
    stp     q10, q11, [sp, #256]
    stp     q12, q13, [sp, #224]
    stp     q14, q15, [sp, #192]
    stp     x8,  x9,  [sp, #176]
    stp     x10, x11, [sp, #160]
    stp     x12, x13, [sp, #144]
    stp     x14, x15, [sp, #128]
    stp     x16, x17, [sp, #112]
    stp     x18, x19, [sp, #96]
    stp     x20, x21, [sp, #80]
    stp     x22, x23, [sp, #64]
    stp     x24, x25, [sp, #48]
    stp     x26, x27, [sp, #32]
    stp     x28, x29, [sp, #16]
    stp     x30, x29, [sp]
.endm
// pop_v_regs: reload x8-x30 and q8-q15 from the 320-byte frame written by
// push_v_regs, then release the frame. The loads are issued in the same
// order as the frame is laid out from sp upwards; x29 is loaded twice
// (offsets #8 and #24), the second load being the one that sticks.
// Flags are not modified.
.macro pop_v_regs
    ldp     x30, x29, [sp]
    ldp     x28, x29, [sp, #16]
    ldp     x26, x27, [sp, #32]
    ldp     x24, x25, [sp, #48]
    ldp     x22, x23, [sp, #64]
    ldp     x20, x21, [sp, #80]
    ldp     x18, x19, [sp, #96]
    ldp     x16, x17, [sp, #112]
    ldp     x14, x15, [sp, #128]
    ldp     x12, x13, [sp, #144]
    ldp     x10, x11, [sp, #160]
    ldp     x8,  x9,  [sp, #176]
    ldp     q14, q15, [sp, #192]
    ldp     q12, q13, [sp, #224]
    ldp     q10, q11, [sp, #256]
    ldp     q8,  q9,  [sp, #288]
    add     sp, sp, #320
.endm
.text
.p2align 2
.global ixheaacd_post_twid_overlap_add_armv8
// -----------------------------------------------------------------------
// ixheaacd_post_twid_overlap_add_armv8
//
// AArch64/NEON routine. The ARMv7 instructions it appears to have been
// ported from are kept as comments throughout.
//
// Register usage as visible in this file (the C prototype is not shown
// here, so the argument meanings are inferred from usage - TODO confirm):
//   x0  base of the 16-bit output; written with single-halfword ST1 stores
//   x1  32-bit input words, read forwards (via x8) and backwards (via x1)
//   x2  table base; 7500 bytes are skipped, then 16-bit pairs are read
//   x3  element count; drives pointer offsets and the vector loop count
//   x4  32-bit buffer that is read and overwritten in place
//   x5  shift amount (copied to x16; also yields v16, w25 and w26)
//   x6  table of 16-bit pairs, read backwards starting at x6 + 2*x3 - 4
//   x7  output stride in halfwords (copied to x17)
//
// NOTE(review): x3, x5 and x7 are consumed as full 64-bit registers (the
// sxtw lines below are commented out). If the C parameters are 32-bit or
// narrower, AAPCS64 leaves the upper bits unspecified - confirm callers.
// -----------------------------------------------------------------------
ixheaacd_post_twid_overlap_add_armv8:
// STMFD sp!, {x4-x12}
// Spill registers with the push_v_regs macro; pop_v_regs undoes it before ret.
push_v_regs
//stp x19, x20,[sp,#-16]!
//VPUSH {d8 - d15}
//LDR w4, [sp, #100]
//sxtw x4,w4
//LDR w5, [sp, #104]
//sxtw x5,w5
//LDR w6, [sp, #108]
//sxtw x6,w6
// Keep copies of x5 and x7: both registers are reused as scratch later on.
MOV x16, x5
MOV x17, x7
// x6 += 2*x3 - 4
LSL x9, x3, #2
ASR x9, x9, #1
ADD x6, x6, x9
SUB x6, x6, #4
// x2 += 7500 (byte offset into the table; its layout is not visible here)
LDR w8, =7500
sxtw x8, w8
ADD x2, x2, x8
// v18 = 50 in each 16-bit lane (multiplier for the "(x * 50) >> 16" terms)
movi v18.4h, #50
// x9 = 15 - x5
sub x20, x5, #15
neg x9, x20
// v20 = 0x8000 in each 32-bit lane (added before the ">> 16" narrowing steps)
movi v20.4s, #0x80, LSL #8
// v16 = low 32 bits of x5 in every lane: per-lane shift count for sQshL
dup v16.4s, w5
// w25 = x5 - 16 (x5 itself is modified here; x16 still holds the original)
SUB x5, x5, #16
//STR w5, [sp, #116]
MOV w25, w5
sxtw x25, w25
// w26 = 1 << (15 - original x5): term added before shifting by w25
MOV x8, #1
LSL x8, x8, x9
//STR w8, [sp, #120]
MOV w26, w8
//sxtw x8,w8
ARM_PROLOGUE:
// Scalar (non-NEON) handling of the first element: consumes two words from
// x1, one word from x2, one word from x6 and one word from x4, and writes
// one word to x4 and two halfwords to the output.
// w8, w9 = two consecutive 32-bit words at x1 (x1 advances by 8 bytes)
LDR w8, [x1], #4
sxtw x8, w8
LDR w9, [x1], #4
sxtw x9, w9
// One 32-bit word from x2, split into two signed 16-bit factors:
// x19 = low half, w10 = high half.
LDR w10, [x2], #4
sxtw x10, w10
AND w19, w10, 0xFFFF
sxth x19, w19
ASR w10, w10, #16
// SMULWT x11, x8, x10
//
// SMULWB x12, x9, x10
// SMULWB x5, x8, x10
// SMLAWT x7, x9, x10, x5
// Each SMULL + ASR #16 pair below replaces one of the ARMv7 instructions
// quoted above: (32-bit value * 16-bit factor) >> 16.
// x11 = (w8 * high) >> 16
SMULL x11, w8, w10
ASR x11, x11, #16
// x12 = (w9 * low) >> 16
SMULL x12, w9, w19
ASR x12, x12, #16
// x5 = (w8 * low) >> 16
SMULL x5, w8, w19
ASR x5, x5, #16
// x7 = ((w9 * high) >> 16) + x5
SMULL x7, w9, w10
ASR x7, x7, #16
ADD x7, x7, x5
// x8 = x12 - x11 ; x5 = -x7 (MVN + 1 is a two's-complement negate)
SUB x8, x12, x11
MVN x5, x7
ADD x5, x5, #1
// Correction terms with the constants +50 / -50:
//   x8 += (w5 *  50) >> 16
//   x5 += (w8 * -50) >> 16   (uses x8 from before the update above)
MOV x9, #50
MOV x12, #-50
AND w19, w9, 0xFFFF
sxth x19, w19
SMULL x10, w5, w19
ASR x10, x10, #16
AND w19, w12, 0xFFFF
sxth x19, w19
SMULL x11, w8, w19
ASR x11, x11, #16
ADD x8, x8, x10
ADD x5, x5, x11
//LDR w11, [sp, #104]
// x11 = sign-extended low 16 bits of x16 (the copy of the x5 argument)
MOV w11, w16
sxth x11, w11
// One 32-bit word from x6 (x6 then steps back 32 bytes), split into
// x19 = low half and w20 = high half.
LDR w10, [x6], #-32
sxtw x10, w10
AND w19, w10, 0xFFFF
sxth x19, w19
ASR w20, w10, #16
//SMULWB x7, x8, x10
// x7 = (w8 * low) >> 16
SMULL x7, w8, w19
ASR x7, x7, #16
// x8 = -x8
MVN x8, x8
ADD x8, x8, #1
//SMULWT x12, x8, x10
// x12 = (w8 * high) >> 16, using the negated x8
SMULL x12, w8, w20
ASR x12, x12, #16
// Negative shift amount -> NEXT (plain right shifts); otherwise fall through.
CMP x11, #0
BLT NEXT
// x9 = 16 - x11
SUB x9, x11, #16
negs x9, x9
// LDR w8, [sp, #120]
//sxtw x8,w8
// w5 = saturating_add32(w5, w26) >> w9. The scalar QADD is done in lane 0
// of a vector SQADD; lane 1 is not read back.
MOV v1.s[0], w26
MOV v2.s[0], w5
//sQADD w5, w5, w8
//ASR w5, w5, w9
SQADD v2.2s, v2.2s, v1.2s
MOV w5, v2.s[0]
ASR w5, w5, w9
// Saturating left shift of x7 by x11:
//   x20 = x7 >> (31 - x11) is 0 or -1 exactly when x7 << x11 still fits in
//   32 signed bits.
// Flag protocol: ADDS sets N/Z from x20; for negative x20 the CMN re-tests
// against -1. At NEXT2: EQ = x20 is 0 or -1, LT = x20 < -1, GT = x20 > 0.
// LDR/csel/LSL do not alter the flags, so all three csel see the same state.
SUB x9, x11, #31
negs x9, x9
ASR x20, x7, x9
//MOV x8, x20
ADDS x8, x20, #0
BGE NEXT2
CMN x8, #1
NEXT2:
// LT -> 0x80000000, GT -> 0x7fffffff, EQ -> x7 << x11
LDR x20, =0x80000000
csel x7, x20, x7, LT
LDR x20, =0x7fffffff
csel x7, x20, x7, GT
LSL x20, x7, x11
csel x7, x20, x7, EQ
// Same saturating left shift for x12.
SUB x9, x11, #31
negs x9, x9
ASR x20, x12, x9
//MOV x8, x20
ADDS x8, x20, #0
BGE NEXT3
CMN x8, #1
NEXT3:
LDR x20, =0x80000000
csel x12, x20, x12, LT
LDR x20, =0x7fffffff
csel x12, x20, x12, GT
LSL x20, x12, x11
csel x12, x20, x12, EQ
B NEXT1
NEXT:
// Negative shift amount: w11 = -w11, then
//   w5  = saturating_add32(w5 >> w11, 0x8000) >> 16
//   w7  >>= w11 ; w12 >>= w11   (arithmetic shifts)
MVN w11, w11
ADD w11, w11, #1
ASR w5, w5, w11
MOV w8, #0x8000
MOV v1.s[0], w8
MOV v2.s[0], w5
//QADD x5, x5, x8
SQADD v2.2s, v2.2s, v1.2s
MOV w5, v2.s[0]
ASR w5, w5, #16
ASR w7, w7, w11
ASR w12, w12, w11
NEXT1:
// w9 = current word at x4; it is then replaced by w5 and x4 advances 4 bytes.
LDR w9, [x4]
sxtw x9, w9
MOV w8, #0x8000
//sxtw x8,w8
STR w5, [x4], #4
sxtw x5, w5
// Split w10 (the word loaded from x6) into zero-extended halves:
// w5 = high half, w10 = low half.
ROR w20, w10, #16
//UXTH x5, x10, ROR #16
UXTH w5, w20
UXTH w10, w10
// v2 = {low, high}; v8 = saturating-narrowed 64-bit products {low*w9, high*w9}
dup v0.2s, w9
dup v2.2s, w10
dup v3.2s, w5
//VZIP.32 D2, D3
ZIP1 v28.2s, v2.2s, v3.2s
ZIP2 v3.2s, v2.2s, v3.2s
MOV v2.8b, v28.8b
sMULL v0.2d, v2.2s, v0.2s
Sqxtn v8.2s, v0.2d
// v0 = {w12, w7}; v8 = (sat_add(sat_shl(sat_sub(v0, v8), 2), 0x8000)) >> 16
dup v0.2s, w12
dup v1.2s, w7
//VZIP.32 D0, D1
ZIP1 v28.2s, v0.2s, v1.2s
ZIP2 v1.2s, v0.2s, v1.2s
MOV v0.8b, v28.8b
SQSUB v8.2s, v0.2s , v8.2s
sQshL v8.2s, v8.2s, #2
dup v0.2s, w8
SQADD v8.2s, v8.2s , v0.2s
sshR v8.2s, v8.2s, #16
// Output pointers and strides:
//   x10 = 2 * x17            (byte stride between outputs)
//   x5  = x0 + sxth(x3 >> 1) * sxth(x10)
//   x0  = x5 - x10 ; x9 = -x10
// x5 then advances by x10 per store and x0 by x9 per store.
MOV x7, x17
//sxtw x7,w7
LSL x10, x7, #1
ASR x5, x3, #1
//SMULBB x5, x10, x5
AND w5, w5, 0xFFFF
sxth x5, w5
AND w19, w10, 0xFFFF
sxth x19, w19
SMULL x5, w19, w5
ADD x5, x5, x0
SUB x0, x5, x10
MVN x9, x10
ADD x9, x9, #1
// Low halfword of lane 1 goes to [x0], low halfword of lane 0 to [x5].
ST1 {v8.h}[2], [x0], x9
ST1 {v8.h}[0], [x5], x10
// x8 = forward read pointer (x1 has already advanced by 8 bytes);
// x1 = x1 + 4*x3 - 40 becomes the backward read pointer; x12 = -32 is the
// backward step used for x1 and x6.
MOV x8, x1
LSL x12, x3, #2
ADD x1, x1, x12
SUB x1, x1, #40
MOV x12, #-32
PROLOGUE_NEON:
// First stage of the software-pipelined vector path. It computes the first
// group of results (left in v14/v15/v22/v23 for CORE_LOOP to store),
// rewrites 8 words at x4, and preloads the operands for the next group.
//
// Recurring idioms in this block:
//  * UZP1 + UZP2 + two MOVs emulate ARMv7 "VUZP.16 Dn, Dm": the low 16-bit
//    halves of four 32-bit lanes end up in one 64-bit half, the high halves
//    in the other.
//  * uMULL (low halves) / ushR #16 / sMLAL (high halves) forms
//    ((low * c) >> 16) + high * c for a 16-bit factor c.
//  * SADDL with a zeroed v1 sign-extends four halfwords to 32 bits
//    (stands in for the quoted VADDL.S16).
//
// Loop count: x3 = (((x3 >> 2) - 4) >> 2) - 2
// NOTE(review): CORE_LOOP decrements and tests only after its body, so this
// count must be >= 1 - confirm the minimum x3 that callers pass.
ASR x3, x3, #2
SUB x3, x3, #4
ASR x3, x3, #2
SUB x3, x3, #2
// 8 words from the backward pointer x1, de-interleaved (v0 = even words,
// v2 = odd words); x1 then steps back 32 bytes.
LD2 { v0.4s, v1.4s}, [x1]
MOV v2.16b, v1.16b
ADD x1, x1, x12
//VUZP.16 D0, D1
UZP1 v28.8h, v0.8h, v0.8h
UZP2 v29.8h, v0.8h, v0.8h
MOV v0.d[0], v28.d[0]
MOV v0.d[1], v29.d[0]
//VUZP.16 D2, D3
UZP1 v28.8h, v2.8h, v2.8h
UZP2 v29.8h, v2.8h, v2.8h
MOV v2.d[0], v28.d[0]
MOV v2.d[1], v29.d[0]
//rev64 v0.8h, v0.8h
// rev64 reverses the halfword order inside each 64-bit half; afterwards
// v0/v2 hold the low halves and v1/v3 the high halves.
rev64 v0.8h, v0.8h
MOV v1.d[0], v0.d[1]
rev64 v2.8h, v2.8h
MOV v3.d[0], v2.d[1]
// 4 halfword pairs from x2, de-interleaved into v8 / v9; x2 += 16.
LD2 {v8.4h, v9.4h}, [x2]
ADD x2, x2, #16
// 8 words from the forward pointer x8 (v4 = even words, v6 = odd words).
LD2 { v4.4s, v5.4s}, [x8]
MOV v6.16b, v5.16b
ADD x8, x8, #32
uMULL v30.4s, v0.4h, v9.4h
// VUZP.16 D4, D5
UZP1 v28.8h, v4.8h, v4.8h
UZP2 v29.8h, v4.8h, v4.8h
MOV v4.d[0], v28.d[0]
MOV v5.d[0], v29.d[0]
uMULL v28.4s, v2.4h, v8.4h
// VUZP.16 D6, D7
UZP1 v26.8h, v6.8h, v6.8h
UZP2 v27.8h, v6.8h, v6.8h
MOV v6.d[0], v26.d[0]
MOV v7.d[0], v27.d[0]
uMULL v26.4s, v0.4h, v8.4h
uMULL v24.4s, v2.4h, v9.4h
// 8 words from x6 (16-bit pairs), de-interleaved; x6 steps back 32 bytes.
LD2 { v10.4s, v11.4s}, [x6]
MOV v12.16b, v11.16b
ADD x6, x6, x12
ushR v30.4s, v30.4s, #16
//VUZP.16 D10, D11
UZP1 v22.8h, v10.8h, v10.8h
UZP2 v23.8h, v10.8h, v10.8h
MOV v10.d[0], v22.d[0]
MOV v10.d[1], v23.d[0]
ushR v28.4s, v28.4s, #16
//VUZP.16 D12, D13
UZP1 v22.8h, v12.8h, v12.8h
UZP2 v23.8h, v12.8h, v12.8h
MOV v12.d[0], v22.d[0]
MOV v12.d[1], v23.d[0]
sMLAL v30.4s, v1.4h, v9.4h
rev64 v10.8h, v10.8h
MOV v11.d[0], v10.d[1]
sMLAL v28.4s, v3.4h, v8.4h
rev64 v12.8h, v12.8h
MOV v13.d[0], v12.d[1]
ushR v26.4s, v26.4s, #16
ushR v24.4s, v24.4s, #16
sMLAL v26.4s, v1.4h, v8.4h
sMLAL v24.4s, v3.4h, v9.4h
// v30 = -(v30 + v28) ; v28 = v24 - v26
ADD v30.4s, v30.4s , v28.4s
NEG v30.4s, v30.4s
uMULL v22.4s, v4.4h, v8.4h
SUB v28.4s, v24.4s , v26.4s
mov v26.16b, v30.16b
mov v24.16b, v28.16b
// VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
// VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
// v2, v0 = copies of v28, v30 multiplied by 50 (v18) and >> 16
uMULL v2.4s, v24.4h, v18.4h
uMULL v0.4s, v26.4h, v18.4h
ushR v22.4s, v22.4s, #16
sMLAL v22.4s, v5.4h, v8.4h
ushR v2.4s, v2.4s, #16
ushR v0.4s, v0.4s, #16
sMLAL v2.4s, v25.4h, v18.4h
sMLAL v0.4s, v27.4h, v18.4h
uMULL v24.4s, v4.4h, v9.4h
uMULL v26.4s, v6.4h, v8.4h
NEG v2.4s, v2.4s
ADD v28.4s, v28.4s , v0.4s
ADD v30.4s, v30.4s , v2.4s
uMULL v0.4s, v6.4h, v9.4h
// NOTE(review): these three unsigned products are shifted with sshR
// (arithmetic), whereas the other products in this routine use ushR -
// confirm this is intended.
sshR v24.4s, v24.4s, #16
sMLAL v24.4s, v5.4h, v9.4h
sshR v26.4s, v26.4s, #16
sshR v0.4s, v0.4s, #16
sMLAL v26.4s, v7.4h, v8.4h
sMLAL v0.4s, v7.4h, v9.4h
// v22 = -(v22 + v0) ; v24 = v26 - v24
ADD v22.4s, v22.4s , v0.4s
NEG v22.4s, v22.4s
SUB v24.4s, v26.4s , v24.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
// v28 = sat_shl(sat_add(v28, w26), w25); SQSHL by register shifts right
// when the per-lane count is negative.
MOV w11, w26
dup v14.4s, w11
SQADD v28.4s, v28.4s , v14.4s
//LDR w11, [sp, #116]
MOV w11, w25
//sxtw x11,w11
dup v0.4s, w11
sQshL v28.4s, v28.4s, v0.4s
mov v0.16b, v22.16b
mov v14.16b, v24.16b
// VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
// VUZP.16 D22, D23
UZP1 v19.8h, v22.8h, v22.8h
UZP2 v21.8h, v22.8h, v22.8h
MOV v22.d[0], v19.d[0]
MOV v23.d[0], v21.d[0]
uMULL v8.4s, v24.4h, v18.4h
uMULL v26.4s, v22.4h, v18.4h
NEG v2.4s, v30.4s
// VUZP.16 D30, D31
UZP1 v19.8h, v30.8h, v30.8h
UZP2 v21.8h, v30.8h, v30.8h
MOV v30.d[0], v19.d[0]
MOV v30.d[1], v21.d[0]
// VUZP.16 D2, D3
UZP1 v19.8h, v2.8h, v2.8h
UZP2 v21.8h, v2.8h, v2.8h
MOV v2.d[0], v19.d[0]
MOV v3.d[0], v21.d[0]
// v4 = (v30 * v12) >> 16 ; v6 = (-v30 * v13) >> 16 (factors from x6)
uMULL v4.4s, v30.4h, v12.4h
uMULL v6.4s, v2.4h, v13.4h
ushR v8.4s, v8.4s, #16
ushR v26.4s, v26.4s, #16
sMLAL v8.4s, v25.4h, v18.4h
sMLAL v26.4s, v23.4h, v18.4h
ushR v4.4s, v4.4s, #16
ushR v6.4s, v6.4s, #16
MOV v19.d[0], v30.d[1]
sMLAL v4.4s, v19.4h, v12.4h
sMLAL v6.4s, v3.4h, v13.4h
NEG v8.4s, v8.4s
ADD v14.4s, v14.4s , v26.4s
ADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
// v0 = sat_shl(sat_add(v0, w26), w25)
MOV w11, w26
dup v8.4s, w11
SQADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #116]
//sxtw x11,w11
MOV w11, w25
dup v26.4s, w11
sQshL v0.4s, v0.4s, v26.4s
mov v26.16b, v28.16b
// Read 8 old words at x4 (v28 = even words, v30 = odd words,
// v29.d[0] = upper half of the even words), then overwrite them with the
// new v26 / v0 values interleaved (v26 -> even words, v0 -> odd words).
LD2 { v28.4s, v29.4s}, [x4]
MOV v30.16b, v29.16b
MOV v29.d[0], v28.d[1]
// VZIP.32 Q13, Q0
ZIP1 v19.4s, v26.4s, v0.4s
ZIP2 v0.4s, v26.4s, v0.4s
MOV v26.16b, v19.16b
ST1 { v26.4s}, [x4], #16
ST1 { v0.4s}, [x4], #16
// v8 = saturating-narrowed products of the old even words with the
// sign-extended factors in v13.
movi v1.2s, #0
//VADDL.S16 Q0, D13, D1
SADDL v0.4s, v13.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v26.2d, v28.2s, v0.2s
Sqxtn v8.2s, v26.2d
sMULL v26.2d, v29.2s, v1.2s
Sqxtn v9.2s, v26.2d
MOV v8.d[1], v9.d[0]
// v26 = same, with the factors in v12.
movi v1.2s, #0
// VADDL.S16 Q0, D12, D1
SADDL v0.4s, v12.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v24.2d, v28.2s, v0.2s
Sqxtn v26.2s, v24.2d
sMULL v24.2d, v29.2s, v1.2s
Sqxtn v27.2s, v24.2d
MOV v26.d[1], v27.d[0]
// v4 = sat_sub(sat_shl(v4, v16), v8) ; v6 = sat_sub(sat_shl(v6, v16), v26)
sQshL v4.4s, v4.4s, v16.4s
sQshL v6.4s, v6.4s, v16.4s
SQSUB v4.4s, v4.4s , v8.4s
SQSUB v6.4s, v6.4s , v26.4s
NEG v26.4s, v14.4s
// VUZP.16 D14, D15
UZP1 v19.8h, v14.8h, v14.8h
UZP2 v21.8h, v14.8h, v14.8h
MOV v14.d[0], v19.d[0]
MOV v15.d[0], v21.d[0]
// VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
// v24 = saturating-narrowed products of the old odd words (v30) with v10.
movi v1.2s, #0
// VADDL.S16 Q0, D10, D1
SADDL v0.4s, v10.4h, v1.4h
// NOTE(review): the parallel sequences copy v0.d[1] here; v1 is not read
// before it is cleared again below, so this move has no effect.
MOV v1.d[0], v0.d[0]
sMULL v22.2d, v30.2s, v0.2s
Sqxtn v24.2s, v22.2d
sMULL2 v22.2d, v30.4s, v0.4s
Sqxtn v25.2s, v22.2d
MOV v24.d[1], v25.d[0]
// v22 = same, with the factors in v11.
movi v1.2s, #0
// VADDL.S16 Q0, D11, D1
SADDL v0.4s, v11.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v8.2d, v30.2s, v0.2s
Sqxtn v22.2s, v8.2d
sMULL2 v8.2d, v30.4s, v0.4s
Sqxtn v23.2s, v8.2d
MOV v22.d[1], v23.d[0]
uMULL v8.4s, v26.4h, v11.4h
uMULL v30.4s, v14.4h, v10.4h
// Preload the next 8 backward words from x1 (interleaved with the
// remaining arithmetic of this group).
LD2 { v0.4s, v1.4s}, [x1]
MOV v2.16b, v1.16b
ADD x1, x1, x12
// VUZP.16 D0, D1
UZP1 v19.8h, v0.8h, v0.8h
UZP2 v21.8h, v0.8h, v0.8h
MOV v0.d[0], v19.d[0]
MOV v0.d[1], v21.d[0]
// VUZP.16 D2, D3
UZP1 v19.8h, v2.8h, v2.8h
UZP2 v21.8h, v2.8h, v2.8h
MOV v2.d[0], v19.d[0]
MOV v2.d[1], v21.d[0]
ushR v8.4s, v8.4s, #16
rev64 v0.8h, v0.8h
MOV v1.d[0], v0.d[1]
ushR v30.4s, v30.4s, #16
rev64 v2.8h, v2.8h
MOV v3.d[0], v2.d[1]
sMLAL v8.4s, v27.4h, v11.4h
sMLAL v30.4s, v15.4h, v10.4h
// Preload the next 8 words from x6.
LD2 { v10.4s, v11.4s}, [x6]
ADD x6, x6, x12
MOV v12.16b, v11.16b
sQshL v4.4s, v4.4s, #2
// VUZP.16 D10, D11
UZP1 v19.8h, v10.8h, v10.8h
UZP2 v21.8h, v10.8h, v10.8h
MOV v10.d[0], v19.d[0]
MOV v10.d[1], v21.d[0]
sQshL v6.4s, v6.4s, #2
// VUZP.16 D12, D13
UZP1 v19.8h, v12.8h, v12.8h
UZP2 v21.8h, v12.8h, v12.8h
MOV v12.d[0], v19.d[0]
MOV v12.d[1], v21.d[0]
// Round (+0x8000 from v20) and take the upper 16 bits:
// v14 <- from v4, v15 <- from v6 (low halfwords after the UZP below).
SQADD v14.4s, v4.4s , v20.4s
rev64 v10.8h, v10.8h
MOV v11.d[0], v10.d[1]
SQADD v6.4s, v6.4s , v20.4s
rev64 v12.8h, v12.8h
MOV v13.d[0], v12.d[1]
sshR v14.4s, v14.4s, #16
// VUZP.16 D14, D15
UZP1 v19.8h, v14.8h, v14.8h
UZP2 v21.8h, v14.8h, v14.8h
MOV v14.d[0], v19.d[0]
MOV v15.d[0], v21.d[0]
sshR v6.4s, v6.4s, #16
// VUZP.16 D6, D7
UZP1 v19.8h, v6.8h, v6.8h
UZP2 v21.8h, v6.8h, v6.8h
MOV v6.d[0], v19.d[0]
MOV v7.d[0], v21.d[0]
mov v15.8b, v6.8b
sQshL v8.4s, v8.4s, v16.4s
// Preload the next 8 forward words from x8.
LD2 { v4.4s, v5.4s}, [x8]
ADD x8, x8, #32
MOV v6.16b, v5.16b
sQshL v30.4s, v30.4s, v16.4s
// VUZP.16 D4, D5
UZP1 v19.8h, v4.8h, v4.8h
UZP2 v21.8h, v4.8h, v4.8h
MOV v4.d[0], v19.d[0]
MOV v5.d[0], v21.d[0]
SQSUB v8.4s, v8.4s , v24.4s
// VUZP.16 D6, D7
UZP1 v19.8h, v6.8h, v6.8h
UZP2 v21.8h, v6.8h, v6.8h
MOV v6.d[0], v19.d[0]
MOV v7.d[0], v21.d[0]
SQSUB v22.4s, v30.4s , v22.4s
sQshL v30.4s, v8.4s, #2
// Preload the next 4 halfword pairs from x2.
LD2 {v8.4h, v9.4h}, [x2]
ADD x2, x2, #16
sQshL v22.4s, v22.4s, #2
// Round and narrow the second pair of results: v22 and v23 (v23 <- v30).
SQADD v30.4s, v30.4s , v20.4s
SQADD v22.4s, v22.4s , v20.4s
sshR v30.4s, v30.4s, #16
// VUZP.16 D30, D31
UZP1 v19.8h, v30.8h, v30.8h
UZP2 v21.8h, v30.8h, v30.8h
MOV v30.d[0], v19.d[0]
MOV v30.d[1], v21.d[0]
sshR v22.4s, v22.4s, #16
// VUZP.16 D22, D23
UZP1 v19.8h, v22.8h, v22.8h
UZP2 v21.8h, v22.8h, v22.8h
MOV v22.d[0], v19.d[0]
MOV v23.d[0], v21.d[0]
mov v23.8b, v30.8b
CORE_LOOP:
// Steady-state iteration of the software-pipelined vector path, repeated
// x3 times (SUBS/BNE at the bottom). Each pass:
//   1. stores the 16 halfword results of the previous group
//      (v14/v22 via x0 stepping by x9, v15/v23 via x5 stepping by x10),
//      interleaved with the first multiplies of the current group;
//   2. computes the current group, rewriting 8 words at x4;
//   3. preloads operands for the next group from x1, x6, x8 and x2.
// Idioms: UZP1/UZP2 + MOV emulate the quoted VUZP.16; uMULL/ushR/sMLAL is a
// 32x16 multiply >> 16; SADDL with zeroed v1 sign-extends halfwords.
ST1 {v14.h}[0], [x0]
ADD x0, x0, x9
uMULL v30.4s, v0.4h, v9.4h
ST1 {v22.h}[0], [x0]
ADD x0, x0, x9
uMULL v28.4s, v2.4h, v8.4h
ST1 {v14.h}[1], [x0]
ADD x0, x0, x9
uMULL v26.4s, v0.4h, v8.4h
ST1 {v22.h}[1], [x0]
ADD x0, x0, x9
uMULL v24.4s, v2.4h, v9.4h
ST1 {v14.h}[2], [x0]
ADD x0, x0, x9
ushR v30.4s, v30.4s, #16
ST1 {v22.h}[2], [x0]
ADD x0, x0, x9
ushR v28.4s, v28.4s, #16
ST1 {v14.h}[3], [x0]
ADD x0, x0, x9
sMLAL v30.4s, v1.4h, v9.4h
ST1 {v22.h}[3], [x0]
ADD x0, x0, x9
sMLAL v28.4s, v3.4h, v8.4h
ST1 {v15.h}[0], [x5]
ADD x5, x5, x10
ushR v26.4s, v26.4s, #16
ST1 {v23.h}[0], [x5]
ADD x5, x5, x10
ushR v24.4s, v24.4s, #16
ST1 {v15.h}[1], [x5]
ADD x5, x5, x10
sMLAL v26.4s, v1.4h, v8.4h
ST1 {v23.h}[1], [x5]
ADD x5, x5, x10
sMLAL v24.4s, v3.4h, v9.4h
ST1 {v15.h}[2], [x5]
ADD x5, x5, x10
ADD v30.4s, v30.4s , v28.4s
ST1 {v23.h}[2], [x5]
ADD x5, x5, x10
NEG v30.4s, v30.4s
ST1 {v15.h}[3], [x5]
ADD x5, x5, x10
ST1 {v23.h}[3], [x5]
ADD x5, x5, x10
// At this point v30 = -(v30 + v28); next v28 = v24 - v26.
SUB v28.4s, v24.4s , v26.4s
mov v26.16b, v30.16b
uMULL v22.4s, v4.4h, v8.4h
mov v24.16b, v28.16b
// VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
// VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
// v2, v0 = copies of v28, v30 multiplied by 50 (v18) and >> 16
uMULL v2.4s, v24.4h, v18.4h
uMULL v0.4s, v26.4h, v18.4h
ushR v22.4s, v22.4s, #16
sMLAL v22.4s, v5.4h, v8.4h
ushR v2.4s, v2.4s, #16
ushR v0.4s, v0.4s, #16
sMLAL v2.4s, v25.4h, v18.4h
sMLAL v0.4s, v27.4h, v18.4h
uMULL v24.4s, v4.4h, v9.4h
uMULL v26.4s, v6.4h, v8.4h
NEG v2.4s, v2.4s
ADD v28.4s, v28.4s , v0.4s
ADD v30.4s, v30.4s , v2.4s
uMULL v0.4s, v6.4h, v9.4h
// NOTE(review): these three unsigned products are shifted with sshR
// (arithmetic), whereas the other products in this routine use ushR -
// confirm this is intended.
sshR v24.4s, v24.4s, #16
sMLAL v24.4s, v5.4h, v9.4h
sshR v26.4s, v26.4s, #16
sshR v0.4s, v0.4s, #16
sMLAL v26.4s, v7.4h, v8.4h
sMLAL v0.4s, v7.4h, v9.4h
// v22 = -(v22 + v0) ; v24 = v26 - v24
ADD v22.4s, v22.4s , v0.4s
NEG v22.4s, v22.4s
SUB v24.4s, v26.4s , v24.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
// v28 = sat_shl(sat_add(v28, w26), w25); SQSHL by register shifts right
// when the per-lane count is negative.
MOV w11, w26
dup v14.4s, w11
SQADD v28.4s, v28.4s , v14.4s
//LDR w11, [sp, #116]
//sxtw x11,w11
MOV w11, w25
dup v0.4s, w11
sQshL v28.4s, v28.4s, v0.4s
mov v0.16b, v22.16b
mov v14.16b, v24.16b
// VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
// VUZP.16 D22, D23
UZP1 v19.8h, v22.8h, v22.8h
UZP2 v21.8h, v22.8h, v22.8h
MOV v22.d[0], v19.d[0]
MOV v23.d[0], v21.d[0]
uMULL v8.4s, v24.4h, v18.4h
uMULL v26.4s, v22.4h, v18.4h
NEG v2.4s, v30.4s
// VUZP.16 D30, D31
UZP1 v19.8h, v30.8h, v30.8h
UZP2 v21.8h, v30.8h, v30.8h
MOV v30.d[0], v19.d[0]
MOV v30.d[1], v21.d[0]
// VUZP.16 D2, D3
UZP1 v19.8h, v2.8h, v2.8h
UZP2 v21.8h, v2.8h, v2.8h
MOV v2.d[0], v19.d[0]
MOV v3.d[0], v21.d[0]
// v4 = (v30 * v12) >> 16 ; v6 = (-v30 * v13) >> 16 (factors from x6)
uMULL v4.4s, v30.4h, v12.4h
uMULL v6.4s, v2.4h, v13.4h
ushR v8.4s, v8.4s, #16
ushR v26.4s, v26.4s, #16
sMLAL v8.4s, v25.4h, v18.4h
sMLAL v26.4s, v23.4h, v18.4h
ushR v4.4s, v4.4s, #16
ushR v6.4s, v6.4s, #16
MOV v19.d[0], v30.d[1]
sMLAL v4.4s, v19.4h, v12.4h
sMLAL v6.4s, v3.4h, v13.4h
NEG v8.4s, v8.4s
ADD v14.4s, v14.4s , v26.4s
ADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
// v0 = sat_shl(sat_add(v0, w26), w25)
MOV w11, w26
dup v8.4s, w11
SQADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #116]
//sxtw x11,w11
MOV w11, w25
dup v26.4s, w11
sQshL v0.4s, v0.4s, v26.4s
mov v26.16b, v28.16b
// Read 8 old words at x4 (v28 = even words, v30 = odd words,
// v29.d[0] = upper half of the even words), then overwrite them with the
// new v26 / v0 values interleaved; x4 advances by 32 bytes.
LD2 { v28.4s, v29.4s}, [x4]
MOV v30.16b, v29.16b
MOV v29.d[0], v28.d[1]
// VZIP.32 Q13, Q0
ZIP1 v19.4s, v26.4s, v0.4s
ZIP2 v0.4s, v26.4s, v0.4s
MOV v26.16b, v19.16b
ST1 { v26.4s}, [x4]
ADD x4, x4, #16
ST1 { v0.4s}, [x4]
ADD x4, x4, #16
// v8 = saturating-narrowed products of the old even words with the
// sign-extended factors in v13.
movi v1.2s, #0
// VADDL.S16 Q0, D13, D1
SADDL v0.4s, v13.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v26.2d, v28.2s, v0.2s
Sqxtn v8.2s, v26.2d
sMULL v26.2d, v29.2s, v1.2s
Sqxtn v9.2s, v26.2d
MOV v8.d[1], v9.d[0]
// v26 = same, with the factors in v12.
movi v1.2s, #0
//VADDL.S16 Q0, D12, D1
SADDL v0.4s, v12.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v24.2d, v28.2s, v0.2s
Sqxtn v26.2s, v24.2d
sMULL v24.2d, v29.2s, v1.2s
Sqxtn v27.2s, v24.2d
MOV v26.d[1], v27.d[0]
// v4 = sat_sub(sat_shl(v4, v16), v8) ; v6 = sat_sub(sat_shl(v6, v16), v26)
sQshL v4.4s, v4.4s, v16.4s
sQshL v6.4s, v6.4s, v16.4s
SQSUB v4.4s, v4.4s , v8.4s
SQSUB v6.4s, v6.4s , v26.4s
NEG v26.4s, v14.4s
// VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
// v24 = saturating-narrowed products of the old odd words (v30) with v10.
movi v1.2s, #0
//VADDL.S16 Q0, D10, D1
SADDL v0.4s, v10.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v22.2d, v30.2s, v0.2s
Sqxtn v24.2s, v22.2d
sMULL2 v22.2d, v30.4s, v0.4s
Sqxtn v25.2s, v22.2d
MOV v24.d[1], v25.d[0]
// v22 = same, with the factors in v11.
movi v1.2s, #0
//VADDL.S16 Q0, D11, D1
SADDL v0.4s, v11.4h, v1.4h
sMULL v8.2d, v30.2s, v0.2s
Sqxtn v22.2s, v8.2d
sMULL2 v8.2d, v30.4s, v0.4s
Sqxtn v23.2s, v8.2d
MOV v22.d[1], v23.d[0]
// VUZP.16 D14, D15
UZP1 v19.8h, v14.8h, v14.8h
UZP2 v21.8h, v14.8h, v14.8h
MOV v14.d[0], v19.d[0]
MOV v15.d[0], v21.d[0]
uMULL v8.4s, v26.4h, v11.4h
uMULL v30.4s, v14.4h, v10.4h
// Preload the next 8 backward words from x1 (x1 steps by x12 = -32).
LD2 { v0.4s, v1.4s}, [x1]
MOV v2.16b, v1.16b
ADD X1, X1, x12
// VUZP.16 D0, D1
UZP1 v19.8h, v0.8h, v0.8h
UZP2 v21.8h, v0.8h, v0.8h
MOV v0.d[0], v19.d[0]
MOV v0.d[1], v21.d[0]
// VUZP.16 D2, D3
UZP1 v19.8h, v2.8h, v2.8h
UZP2 v21.8h, v2.8h, v2.8h
MOV v2.d[0], v19.d[0]
MOV v2.d[1], v21.d[0]
ushR v8.4s, v8.4s, #16
rev64 v0.8h, v0.8h
MOV v1.d[0], v0.d[1]
ushR v30.4s, v30.4s, #16
rev64 v2.8h, v2.8h
MOV v3.d[0], v2.d[1]
sMLAL v8.4s, v27.4h, v11.4h
sMLAL v30.4s, v15.4h, v10.4h
// Preload the next 8 words from x6 (x6 steps by x12 = -32).
LD2 { v10.4s, v11.4s}, [x6]
add X6, x6, x12
MOV v12.16b, v11.16b
sQshL v4.4s, v4.4s, #2
//VUZP.16 D10, D11
UZP1 v19.8h, v10.8h, v10.8h
UZP2 v21.8h, v10.8h, v10.8h
MOV v10.d[0], v19.d[0]
MOV v10.d[1], v21.d[0]
sQshL v6.4s, v6.4s, #2
// VUZP.16 D12, D13
UZP1 v19.8h, v12.8h, v12.8h
UZP2 v21.8h, v12.8h, v12.8h
MOV v12.d[0], v19.d[0]
MOV v12.d[1], v21.d[0]
// Round (+0x8000 from v20) and take the upper 16 bits:
// v14 <- from v4, v15 <- from v6 (low halfwords after the UZP below).
SQADD v14.4s, v4.4s , v20.4s
rev64 v10.8h, v10.8h
MOV v11.d[0], v10.d[1]
SQADD v6.4s, v6.4s , v20.4s
rev64 v12.8h, v12.8h
MOV v13.d[0], v12.d[1]
sshR v14.4s, v14.4s, #16
// VUZP.16 D14, D15
UZP1 v19.8h, v14.8h, v14.8h
UZP2 v21.8h, v14.8h, v14.8h
MOV v14.d[0], v19.d[0]
MOV v15.d[0], v21.d[0]
sshR v6.4s, v6.4s, #16
// VUZP.16 D6, D7
UZP1 v19.8h, v6.8h, v6.8h
UZP2 v21.8h, v6.8h, v6.8h
MOV v6.d[0], v19.d[0]
MOV v7.d[0], v21.d[0]
mov v15.8b, v6.8b
sQshL v8.4s, v8.4s, v16.4s
// Preload the next 8 forward words from x8.
LD2 { v4.4s, v5.4s}, [x8]
ADD x8, x8, #32
MOV v6.16b, v5.16b
sQshL v30.4s, v30.4s, v16.4s
// VUZP.16 D4, D5
UZP1 v19.8h, v4.8h, v4.8h
UZP2 v21.8h, v4.8h, v4.8h
MOV v4.d[0], v19.d[0]
MOV v5.d[0], v21.d[0]
SQSUB v8.4s, v8.4s , v24.4s
// VUZP.16 D6, D7
UZP1 v19.8h, v6.8h, v6.8h
UZP2 v21.8h, v6.8h, v6.8h
MOV v6.d[0], v19.d[0]
MOV v7.d[0], v21.d[0]
SQSUB v22.4s, v30.4s , v22.4s
sQshL v30.4s, v8.4s, #2
// Preload the next 4 halfword pairs from x2.
LD2 {v8.4h, v9.4h}, [x2]
ADD x2, x2, #16
sQshL v22.4s, v22.4s, #2
// Round and narrow the second pair of results: v22 and v23 (v23 <- v30).
SQADD v30.4s, v30.4s , v20.4s
SQADD v22.4s, v22.4s , v20.4s
sshR v30.4s, v30.4s, #16
// VUZP.16 D30, D31
UZP1 v19.8h, v30.8h, v30.8h
UZP2 v21.8h, v30.8h, v30.8h
MOV v30.d[0], v19.d[0]
MOV v30.d[1], v21.d[0]
sshR v22.4s, v22.4s, #16
// VUZP.16 D22, D23
UZP1 v19.8h, v22.8h, v22.8h
UZP2 v21.8h, v22.8h, v22.8h
MOV v22.d[0], v19.d[0]
MOV v23.d[0], v21.d[0]
mov v23.8b, v30.8b
// Loop while the remaining count in x3 is non-zero.
SUBS x3, x3, #1
BNE CORE_LOOP
EPILOGUE:
// Drain stage of the software-pipelined vector path. It stores the 16
// halfword results left by the last CORE_LOOP pass, computes one more full
// group from the operands that pass preloaded (no further preloads are
// issued here), rewrites 8 words at x4, and stores that group's 16
// halfwords as well.
// Idioms: UZP1/UZP2 + MOV emulate the quoted VUZP.16; uMULL/ushR/sMLAL is a
// 32x16 multiply >> 16; SADDL with zeroed v1 sign-extends halfwords.
ST1 {v14.h}[0], [x0]
ADD x0, x0, x9
uMULL v30.4s, v0.4h, v9.4h
ST1 {v22.h}[0], [x0]
ADD x0, x0, x9
uMULL v28.4s, v2.4h, v8.4h
ST1 {v14.h}[1], [x0]
ADD x0, x0, x9
uMULL v26.4s, v0.4h, v8.4h
ST1 {v22.h}[1], [x0]
ADD x0, x0, x9
uMULL v24.4s, v2.4h, v9.4h
ST1 {v14.h}[2], [x0]
ADD x0, x0, x9
ushR v30.4s, v30.4s, #16
ST1 {v22.h}[2], [x0]
ADD x0, x0, x9
ushR v28.4s, v28.4s, #16
ST1 {v14.h}[3], [x0]
ADD x0, x0, x9
sMLAL v30.4s, v1.4h, v9.4h
ST1 {v22.h}[3], [x0]
ADD x0, x0, x9
sMLAL v28.4s, v3.4h, v8.4h
ST1 {v15.h}[0], [x5]
ADD x5, x5, x10
ushR v26.4s, v26.4s, #16
ST1 {v23.h}[0], [x5]
ADD x5, x5, x10
ushR v24.4s, v24.4s, #16
ST1 {v15.h}[1], [x5]
ADD x5, x5, x10
sMLAL v26.4s, v1.4h, v8.4h
ST1 {v23.h}[1], [x5]
ADD x5, x5, x10
sMLAL v24.4s, v3.4h, v9.4h
ST1 {v15.h}[2], [x5]
ADD x5, x5, x10
ADD v30.4s, v30.4s , v28.4s
ST1 {v23.h}[2], [x5]
ADD x5, x5, x10
NEG v30.4s, v30.4s
ST1 {v15.h}[3], [x5]
ADD x5, x5, x10
ST1 {v23.h}[3], [x5]
ADD x5, x5, x10
// At this point v30 = -(v30 + v28); next v28 = v24 - v26.
SUB v28.4s, v24.4s , v26.4s
uMULL v22.4s, v4.4h, v8.4h
// NOTE(review): the pair of moves below appears twice in a row; the second
// pair repeats the first and changes nothing.
mov v26.16b, v30.16b
mov v24.16b, v28.16b
mov v26.16b, v30.16b
mov v24.16b, v28.16b
//VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
// VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
// v2, v0 = copies of v28, v30 multiplied by 50 (v18) and >> 16
uMULL v2.4s, v24.4h, v18.4h
uMULL v0.4s, v26.4h, v18.4h
ushR v22.4s, v22.4s, #16
sMLAL v22.4s, v5.4h, v8.4h
ushR v2.4s, v2.4s, #16
ushR v0.4s, v0.4s, #16
sMLAL v2.4s, v25.4h, v18.4h
sMLAL v0.4s, v27.4h, v18.4h
uMULL v24.4s, v4.4h, v9.4h
uMULL v26.4s, v6.4h, v8.4h
NEG v2.4s, v2.4s
ADD v28.4s, v28.4s , v0.4s
ADD v30.4s, v30.4s , v2.4s
uMULL v0.4s, v6.4h, v9.4h
// NOTE(review): these three unsigned products are shifted with sshR
// (arithmetic), whereas the other products in this routine use ushR -
// confirm this is intended.
sshR v24.4s, v24.4s, #16
sMLAL v24.4s, v5.4h, v9.4h
sshR v26.4s, v26.4s, #16
sshR v0.4s, v0.4s, #16
sMLAL v26.4s, v7.4h, v8.4h
sMLAL v0.4s, v7.4h, v9.4h
// v22 = -(v22 + v0) ; v24 = v26 - v24
ADD v22.4s, v22.4s , v0.4s
NEG v22.4s, v22.4s
SUB v24.4s, v26.4s , v24.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
// v28 = sat_shl(sat_add(v28, w26), w25); SQSHL by register shifts right
// when the per-lane count is negative.
MOV w11, w26
dup v14.4s, w11
SQADD v28.4s, v28.4s , v14.4s
//LDR w11, [sp, #116]
//sxtw x11,w11
MOV w11, w25
dup v0.4s, w11
sQshL v28.4s, v28.4s, v0.4s
mov v0.16b, v22.16b
mov v14.16b, v24.16b
// VUZP.16 D22, D23
UZP1 v19.8h, v22.8h, v22.8h
UZP2 v21.8h, v22.8h, v22.8h
MOV v22.d[0], v19.d[0]
MOV v23.d[0], v21.d[0]
// VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
uMULL v8.4s, v24.4h, v18.4h
uMULL v26.4s, v22.4h, v18.4h
NEG v2.4s, v30.4s
// VUZP.16 D30, D31
UZP1 v19.8h, v30.8h, v30.8h
UZP2 v21.8h, v30.8h, v30.8h
MOV v30.d[0], v19.d[0]
MOV v30.d[1], v21.d[0]
// VUZP.16 D2, D3
UZP1 v19.8h, v2.8h, v2.8h
UZP2 v21.8h, v2.8h, v2.8h
MOV v2.d[0], v19.d[0]
MOV v3.d[0], v21.d[0]
// v4 = (v30 * v12) >> 16 ; v6 = (-v30 * v13) >> 16 (factors from x6)
uMULL v4.4s, v30.4h, v12.4h
uMULL v6.4s, v2.4h, v13.4h
ushR v8.4s, v8.4s, #16
ushR v26.4s, v26.4s, #16
sMLAL v8.4s, v25.4h, v18.4h
sMLAL v26.4s, v23.4h, v18.4h
ushR v4.4s, v4.4s, #16
ushR v6.4s, v6.4s, #16
MOV v19.d[0], v30.d[1]
sMLAL v4.4s, v19.4h, v12.4h
sMLAL v6.4s, v3.4h, v13.4h
NEG v8.4s, v8.4s
ADD v14.4s, v14.4s , v26.4s
ADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
// v0 = sat_shl(sat_add(v0, w26), w25)
MOV w11, w26
dup v8.4s, w11
SQADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #116]
//sxtw x11,w11
MOV w11, w25
dup v26.4s, w11
sQshL v0.4s, v0.4s, v26.4s
mov v26.16b, v28.16b
// Read 8 old words at x4 (v28 = even words, v30 = odd words,
// v29.d[0] = upper half of the even words), then overwrite them with the
// new v26 / v0 values interleaved; x4 advances by 32 bytes.
LD2 { v28.4s, v29.4s}, [x4]
MOV v30.16b, v29.16b
MOV v29.d[0], v28.d[1]
// VZIP.32 Q13, Q0
ZIP1 v19.4s, v26.4s, v0.4s
ZIP2 v0.4s, v26.4s, v0.4s
MOV v26.16b, v19.16b
ST1 { v26.4s}, [x4], #16
ST1 { v0.4s}, [x4], #16
// v8 = saturating-narrowed products of the old even words with the
// sign-extended factors in v13.
movi v1.2s, #0
// VADDL.S16 Q0, D13, D1
SADDL v0.4s, v13.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v26.2d, v28.2s, v0.2s
Sqxtn v8.2s, v26.2d
sMULL v26.2d, v29.2s, v1.2s
Sqxtn v9.2s, v26.2d
MOV v8.d[1], v9.d[0]
// v26 = same, with the factors in v12.
movi v1.2s, #0
// VADDL.S16 Q0, D12, D1
SADDL v0.4s, v12.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v24.2d, v28.2s, v0.2s
Sqxtn v26.2s, v24.2d
sMULL v24.2d, v29.2s, v1.2s
Sqxtn v27.2s, v24.2d
MOV v26.d[1], v27.d[0]
// v4 = sat_sub(sat_shl(v4, v16), v8) ; v6 = sat_sub(sat_shl(v6, v16), v26)
sQshL v4.4s, v4.4s, v16.4s
sQshL v6.4s, v6.4s, v16.4s
SQSUB v4.4s, v4.4s , v8.4s
SQSUB v6.4s, v6.4s , v26.4s
NEG v26.4s, v14.4s
// VUZP.16 D14, D15
UZP1 v19.8h, v14.8h, v14.8h
UZP2 v21.8h, v14.8h, v14.8h
MOV v14.d[0], v19.d[0]
MOV v15.d[0], v21.d[0]
// VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
// v24 = saturating-narrowed products of the old odd words (v30) with v10.
movi v1.2s, #0
//VADDL.S16 Q0, D10, D1
SADDL v0.4s, v10.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v22.2d, v30.2s, v0.2s
Sqxtn v24.2s, v22.2d
sMULL2 v22.2d, v30.4s, v0.4s
Sqxtn v25.2s, v22.2d
MOV v24.d[1], v25.d[0]
// v22 = same, with the factors in v11.
movi v1.2s, #0
//VADDL.S16 Q0, D11, D1
SADDL v0.4s, v11.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v8.2d, v30.2s, v0.2s
Sqxtn v22.2s, v8.2d
sMULL2 v8.2d, v30.4s, v0.4s
Sqxtn v23.2s, v8.2d
MOV v22.d[1], v23.d[0]
uMULL v8.4s, v26.4h, v11.4h
uMULL v30.4s, v14.4h, v10.4h
ushR v8.4s, v8.4s, #16
ushR v30.4s, v30.4s, #16
sMLAL v8.4s, v27.4h, v11.4h
sMLAL v30.4s, v15.4h, v10.4h
sQshL v4.4s, v4.4s, #2
sQshL v6.4s, v6.4s, #2
// Round (+0x8000 from v20) and take the upper 16 bits:
// v14 <- from v4, v15 <- from v6 (low halfwords after the UZP below).
SQADD v14.4s, v4.4s , v20.4s
SQADD v6.4s, v6.4s , v20.4s
sshR v14.4s, v14.4s, #16
// VUZP.16 D14, D15
UZP1 v19.8h, v14.8h, v14.8h
UZP2 v21.8h, v14.8h, v14.8h
MOV v14.d[0], v19.d[0]
MOV v15.d[0], v21.d[0]
sshR v6.4s, v6.4s, #16
// VUZP.16 D6, D7
UZP1 v19.8h, v6.8h, v6.8h
UZP2 v21.8h, v6.8h, v6.8h
MOV v6.d[0], v19.d[0]
MOV v7.d[0], v21.d[0]
mov v15.8b, v6.8b
sQshL v8.4s, v8.4s, v16.4s
sQshL v30.4s, v30.4s, v16.4s
SQSUB v8.4s, v8.4s , v24.4s
SQSUB v22.4s, v30.4s , v22.4s
sQshL v30.4s, v8.4s, #2
sQshL v22.4s, v22.4s, #2
// Round and narrow the second pair of results: v22 and v23 (v23 <- v30).
SQADD v30.4s, v30.4s , v20.4s
SQADD v22.4s, v22.4s , v20.4s
sshR v30.4s, v30.4s, #16
//VUZP.16 D30, D31
UZP1 v19.8h, v30.8h, v30.8h
UZP2 v21.8h, v30.8h, v30.8h
MOV v30.d[0], v19.d[0]
MOV v30.d[1], v21.d[0]
sshR v22.4s, v22.4s, #16
// VUZP.16 D22, D23
UZP1 v19.8h, v22.8h, v22.8h
UZP2 v21.8h, v22.8h, v22.8h
MOV v22.d[0], v19.d[0]
MOV v23.d[0], v21.d[0]
mov v23.8b, v30.8b
// Store this group's 16 halfwords: v14/v22 alternate via x0 (step x9),
// v15/v23 alternate via x5 (step x10).
ST1 {v14.h}[0], [x0]
ADD x0, x0, x9
ST1 {v22.h}[0], [x0]
ADD x0, x0, x9
ST1 {v14.h}[1], [x0]
ADD x0, x0, x9
ST1 {v22.h}[1], [x0]
ADD x0, x0, x9
ST1 {v14.h}[2], [x0]
ADD x0, x0, x9
ST1 {v22.h}[2], [x0]
ADD x0, x0, x9
ST1 {v14.h}[3], [x0]
ADD x0, x0, x9
ST1 {v22.h}[3], [x0]
ADD x0, x0, x9
ST1 {v15.h}[0], [x5]
ADD x5, x5, x10
ST1 {v23.h}[0], [x5]
ADD x5, x5, x10
ST1 {v15.h}[1], [x5]
ADD x5, x5, x10
ST1 {v23.h}[1], [x5]
ADD x5, x5, x10
ST1 {v15.h}[2], [x5]
ADD x5, x5, x10
ST1 {v23.h}[2], [x5]
ADD x5, x5, x10
ST1 {v15.h}[3], [x5]
ADD x5, x5, x10
ST1 {v23.h}[3], [x5]
ADD x5, x5, x10
ARM_EPILOGUE:
ARM_LOOP:
// Tail of the routine: one final, partial group. Compared with the full
// groups it loads fewer words from x8, x6 and x4 (padding the missing lanes
// with zero), writes 7 words back to the x4 buffer and stores 7 halfwords
// through each of x0 and x5. No branch in this file targets ARM_LOOP, so
// the body runs once. Registers are then restored and the routine returns.
// Idioms: UZP1/UZP2 + MOV emulate the quoted VUZP.16; uMULL/ushR/sMLAL is a
// 32x16 multiply >> 16; SADDL with zeroed v1 sign-extends halfwords.
// 8 words from the backward pointer x1 (v0 = even words, v2 = odd words).
LD2 { v0.4s, v1.4s}, [x1]
MOV v2.16b, v1.16b
//VUZP.16 D0, D1
UZP1 v19.8h, v0.8h, v0.8h
UZP2 v21.8h, v0.8h, v0.8h
MOV v0.d[0], v19.d[0]
MOV v0.d[1], v21.d[0]
//VUZP.16 D2, D3
UZP1 v19.8h, v2.8h, v2.8h
UZP2 v21.8h, v2.8h, v2.8h
MOV v2.d[0], v19.d[0]
MOV v2.d[1], v21.d[0]
rev64 v0.8h, v0.8h
MOV v1.d[0], v0.d[1]
rev64 v2.8h, v2.8h
MOV v3.d[0], v2.d[1]
// 4 halfword pairs from x2, de-interleaved into v8 / v9.
LD2 {v8.4h, v9.4h}, [x2]
ADD x2, x2, #16
// Forward input: only 6 words are read from x8 (4 de-interleaved by LD2,
// then one word each into v5.s[0] and v7.s[0]); the remaining lanes are zero.
LD2 {v4.2s, v5.2s}, [x8]
ADD x8, x8, #16
MOV v6.16b, v5.16b
movi v5.2s, #0x00000000
movi v7.2s, #0x00000000
LD1 {v5.s}[0], [x8], #4
LD1 {v7.s}[0], [x8]
// NOTE(review): x12 is overwritten with #-4 below before it is read, so
// this assignment is dead.
MOV x12, #16
MOV v4.d[1], v5.d[0]
MOV v6.d[1], v7.d[0]
// VUZP.16 D4, D5
UZP1 v19.8h, v4.8h, v4.8h
UZP2 v21.8h, v4.8h, v4.8h
MOV v4.d[0], v19.d[0]
MOV v5.d[0], v21.d[0]
// VUZP.16 D6, D7
UZP1 v19.8h, v6.8h, v6.8h
UZP2 v21.8h, v6.8h, v6.8h
MOV v6.d[0], v19.d[0]
MOV v7.d[0], v21.d[0]
// Factors from x6: x6 is moved forward 16 bytes, 4 words are read with LD2,
// then three single words are read while x6 steps back 4 bytes at a time
// (x12 = -4). v10.s[0] stays zero.
ADD x6, x6, #16
MOV x12, #-4
LD2 {v11.2s, v12.2s}, [x6]
ADD x6, x6, x12
MOV v13.16b, v12.16b
movi v10.2s, #0x00000000
LD1 {v12.s}[1], [x6]
ADD x6, x6, x12
LD1 {v10.s}[1], [x6]
ADD x6, x6, x12
LD1 {v12.s}[0], [x6]
ADD x6, x6, x12
MOV v10.d[1], v11.d[0]
MOV v12.d[1], v13.d[0]
//VUZP.16 D10, D11
UZP1 v19.8h, v10.8h, v10.8h
UZP2 v21.8h, v10.8h, v10.8h
MOV v10.d[0], v19.d[0]
MOV v10.d[1], v21.d[0]
//VUZP.16 D12, D13
UZP1 v19.8h, v12.8h, v12.8h
UZP2 v21.8h, v12.8h, v12.8h
MOV v12.d[0], v19.d[0]
MOV v12.d[1], v21.d[0]
rev64 v10.8h, v10.8h
MOV v11.d[0], v10.d[1]
rev64 v12.8h, v12.8h
MOV v13.d[0], v12.d[1]
// Products of the backward words (v0/v1, v2/v3) with the x2 factors.
uMULL v30.4s, v0.4h, v9.4h
uMULL v28.4s, v2.4h, v8.4h
uMULL v26.4s, v0.4h, v8.4h
uMULL v24.4s, v2.4h, v9.4h
ushR v30.4s, v30.4s, #16
ushR v28.4s, v28.4s, #16
sMLAL v30.4s, v1.4h, v9.4h
sMLAL v28.4s, v3.4h, v8.4h
ushR v26.4s, v26.4s, #16
ushR v24.4s, v24.4s, #16
sMLAL v26.4s, v1.4h, v8.4h
sMLAL v24.4s, v3.4h, v9.4h
// v30 = -(v30 + v28) ; v28 = v24 - v26
ADD v30.4s, v30.4s , v28.4s
NEG v30.4s, v30.4s
uMULL v22.4s, v4.4h, v8.4h
SUB v28.4s, v24.4s , v26.4s
mov v26.16b, v30.16b
mov v24.16b, v28.16b
// VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
//VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
// v2, v0 = copies of v28, v30 multiplied by 50 (v18) and >> 16
uMULL v2.4s, v24.4h, v18.4h
uMULL v0.4s, v26.4h, v18.4h
ushR v22.4s, v22.4s, #16
sMLAL v22.4s, v5.4h, v8.4h
ushR v2.4s, v2.4s, #16
ushR v0.4s, v0.4s, #16
sMLAL v2.4s, v25.4h, v18.4h
sMLAL v0.4s, v27.4h, v18.4h
uMULL v24.4s, v4.4h, v9.4h
uMULL v26.4s, v6.4h, v8.4h
NEG v2.4s, v2.4s
ADD v28.4s, v28.4s , v0.4s
ADD v30.4s, v30.4s , v2.4s
uMULL v0.4s, v6.4h, v9.4h
// NOTE(review): these three unsigned products are shifted with sshR
// (arithmetic), whereas the other products in this routine use ushR -
// confirm this is intended.
sshR v24.4s, v24.4s, #16
sMLAL v24.4s, v5.4h, v9.4h
sshR v26.4s, v26.4s, #16
sshR v0.4s, v0.4s, #16
sMLAL v26.4s, v7.4h, v8.4h
sMLAL v0.4s, v7.4h, v9.4h
// v22 = -(v22 + v0) ; v24 = v26 - v24
ADD v22.4s, v22.4s , v0.4s
NEG v22.4s, v22.4s
SUB v24.4s, v26.4s , v24.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
// v28 = sat_shl(sat_add(v28, w26), w25); SQSHL by register shifts right
// when the per-lane count is negative.
MOV w11, w26
dup v14.4s, w11
SQADD v28.4s, v28.4s , v14.4s
//LDR w11, [sp, #116]
//sxtw x11,w11
MOV w11, w25
dup v0.4s, w11
sQshL v28.4s, v28.4s, v0.4s
mov v0.16b, v22.16b
mov v14.16b, v24.16b
// VUZP.16 D22, D23
UZP1 v19.8h, v22.8h, v22.8h
UZP2 v21.8h, v22.8h, v22.8h
MOV v22.d[0], v19.d[0]
MOV v23.d[0], v21.d[0]
// VUZP.16 D24, D25
UZP1 v19.8h, v24.8h, v24.8h
UZP2 v21.8h, v24.8h, v24.8h
MOV v24.d[0], v19.d[0]
MOV v25.d[0], v21.d[0]
uMULL v8.4s, v24.4h, v18.4h
uMULL v26.4s, v22.4h, v18.4h
NEG v2.4s, v30.4s
// VUZP.16 D30, D31
UZP1 v19.8h, v30.8h, v30.8h
UZP2 v21.8h, v30.8h, v30.8h
MOV v30.d[0], v19.d[0]
MOV v30.d[1], v21.d[0]
// VUZP.16 D2, D3
UZP1 v19.8h, v2.8h, v2.8h
UZP2 v21.8h, v2.8h, v2.8h
MOV v2.d[0], v19.d[0]
MOV v3.d[0], v21.d[0]
// v4 = (v30 * v12) >> 16 ; v6 = (-v30 * v13) >> 16 (factors from x6)
uMULL v4.4s, v30.4h, v12.4h
uMULL v6.4s, v2.4h, v13.4h
ushR v8.4s, v8.4s, #16
ushR v26.4s, v26.4s, #16
sMLAL v8.4s, v25.4h, v18.4h
sMLAL v26.4s, v23.4h, v18.4h
ushR v4.4s, v4.4s, #16
ushR v6.4s, v6.4s, #16
MOV v19.d[0], v30.d[1]
sMLAL v4.4s, v19.4h, v12.4h
sMLAL v6.4s, v3.4h, v13.4h
NEG v8.4s, v8.4s
ADD v14.4s, v14.4s , v26.4s
ADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #120]
//sxtw x11,w11
// v0 = sat_shl(sat_add(v0, w26), w25)
MOV w11, w26
dup v8.4s, w11
SQADD v0.4s, v0.4s , v8.4s
//LDR w11, [sp, #116]
//sxtw x11,w11
MOV w11, w25
dup v26.4s, w11
sQshL v0.4s, v0.4s, v26.4s
mov v26.16b, v28.16b
// x6 is reused as the write pointer into the x4 buffer. 7 old words are
// read through x4 (4 with LD1, then 3 single lanes; the 8th lane is zero)
// and de-interleaved so v28 = even words and v30 = odd words.
MOV x6, x4
LD1 {v28.2s, v29.2s}, [x4], #16
movi v19.2s, #0x00000000
LD1 {v30.s}[0], [x4], #4
LD1 {v30.s}[1], [x4], #4
LD1 {v19.s}[0], [x4], #4
MOV v28.d[1], v29.d[0]
MOV v30.d[1], v19.d[0]
//VUZP.32 Q14, Q15
UZP1 v19.4s, v28.4s, v30.4s
UZP2 v30.4s, v28.4s, v30.4s
MOV v28.16b, v19.16b
MOV v29.d[0], v28.d[1]
// Write 7 new words back, alternating v26 and v0 lanes.
ST1 {v26.s}[0], [x6], #4
ST1 {v0.s}[0], [x6], #4
ST1 {v26.s}[1], [x6], #4
ST1 {v0.s}[1], [x6], #4
ST1 {v26.s}[2], [x6], #4
ST1 {v0.s}[2], [x6], #4
ST1 {v26.s}[3], [x6], #4
// v8 = saturating-narrowed products of the old even words with the
// sign-extended factors in v13.
movi v1.2s, #0
//VADDL.S16 Q0, D13, D1
SADDL v0.4s, v13.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v26.2d, v28.2s, v0.2s
Sqxtn v8.2s, v26.2d
sMULL v26.2d, v29.2s, v1.2s
Sqxtn v9.2s, v26.2d
MOV v8.d[1], v9.d[0]
// v26 = same, with the factors in v12.
movi v1.2s, #0
//VADDL.S16 Q0, D12, D1
SADDL v0.4s, v12.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v24.2d, v28.2s, v0.2s
Sqxtn v26.2s, v24.2d
sMULL v24.2d, v29.2s, v1.2s
Sqxtn v27.2s, v24.2d
MOV v26.d[1], v27.d[0]
// v4 = sat_sub(sat_shl(v4, v16), v8) ; v6 = sat_sub(sat_shl(v6, v16), v26)
sQshL v4.4s, v4.4s, v16.4s
sQshL v6.4s, v6.4s, v16.4s
SQSUB v4.4s, v4.4s , v8.4s
SQSUB v6.4s, v6.4s , v26.4s
NEG v26.4s, v14.4s
//VUZP.16 D14, D15
UZP1 v19.8h, v14.8h, v14.8h
UZP2 v21.8h, v14.8h, v14.8h
MOV v14.d[0], v19.d[0]
MOV v15.d[0], v21.d[0]
// VUZP.16 D26, D27
UZP1 v19.8h, v26.8h, v26.8h
UZP2 v21.8h, v26.8h, v26.8h
MOV v26.d[0], v19.d[0]
MOV v27.d[0], v21.d[0]
// v24 = saturating-narrowed products of the old odd words (v30) with v10.
movi v1.2s, #0
//VADDL.S16 Q0, D10, D1
SADDL v0.4s, v10.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v22.2d, v30.2s, v0.2s
Sqxtn v24.2s, v22.2d
sMULL2 v22.2d, v30.4s, v0.4s
Sqxtn v25.2s, v22.2d
MOV v24.d[1], v25.d[0]
// v22 = same, with the factors in v11.
movi v1.2s, #0
// VADDL.S16 Q0, D11, D1
SADDL v0.4s, v11.4h, v1.4h
MOV v1.d[0], v0.d[1]
sMULL v8.2d, v30.2s, v0.2s
Sqxtn v22.2s, v8.2d
sMULL2 v8.2d, v30.4s, v0.4s
Sqxtn v23.2s, v8.2d
MOV v22.d[1], v23.d[0]
uMULL v8.4s, v26.4h, v11.4h
uMULL v30.4s, v14.4h, v10.4h
ushR v8.4s, v8.4s, #16
ushR v30.4s, v30.4s, #16
sMLAL v8.4s, v27.4h, v11.4h
sMLAL v30.4s, v15.4h, v10.4h
sQshL v4.4s, v4.4s, #2
sQshL v6.4s, v6.4s, #2
// Round (+0x8000 from v20) and take the upper 16 bits:
// v14 <- from v4, v15 <- from v6 (low halfwords after the UZP below).
SQADD v14.4s, v4.4s , v20.4s
SQADD v6.4s, v6.4s , v20.4s
sshR v14.4s, v14.4s, #16
// VUZP.16 D14, D15
UZP1 v19.8h, v14.8h, v14.8h
UZP2 v21.8h, v14.8h, v14.8h
MOV v14.d[0], v19.d[0]
MOV v15.d[0], v21.d[0]
sshR v6.4s, v6.4s, #16
//VUZP.16 D6, D7
UZP1 v19.8h, v6.8h, v6.8h
UZP2 v21.8h, v6.8h, v6.8h
MOV v6.d[0], v19.d[0]
MOV v7.d[0], v21.d[0]
mov v15.8b, v6.8b
sQshL v8.4s, v8.4s, v16.4s
sQshL v30.4s, v30.4s, v16.4s
SQSUB v8.4s, v8.4s , v24.4s
SQSUB v22.4s, v30.4s , v22.4s
sQshL v30.4s, v8.4s, #2
sQshL v22.4s, v22.4s, #2
// Round and narrow the second pair of results: v22 and v23 (v23 <- v30).
SQADD v30.4s, v30.4s , v20.4s
SQADD v22.4s, v22.4s , v20.4s
sshR v30.4s, v30.4s, #16
// VUZP.16 D30, D31
UZP1 v19.8h, v30.8h, v30.8h
UZP2 v21.8h, v30.8h, v30.8h
MOV v30.d[0], v19.d[0]
MOV v30.d[1], v21.d[0]
sshR v22.4s, v22.4s, #16
// VUZP.16 D22, D23
UZP1 v19.8h, v22.8h, v22.8h
UZP2 v21.8h, v22.8h, v22.8h
MOV v22.d[0], v19.d[0]
MOV v23.d[0], v21.d[0]
mov v23.8b, v30.8b
// Store the final 7 halfwords through x0 (step x9): lane 3 of v22 is not
// written here.
ST1 {v14.h}[0], [x0]
ADD x0, x0, x9
ST1 {v22.h}[0], [x0]
ADD x0, x0, x9
ST1 {v14.h}[1], [x0]
ADD x0, x0, x9
ST1 {v22.h}[1], [x0]
ADD x0, x0, x9
ST1 {v14.h}[2], [x0]
ADD x0, x0, x9
ST1 {v22.h}[2], [x0]
ADD x0, x0, x9
ST1 {v14.h}[3], [x0]
ADD x0, x0, x9
// Store the final 7 halfwords through x5 (step x10): lane 3 of v23 is not
// written here.
ST1 {v15.h}[0], [x5]
ADD x5, x5, x10
ST1 {v23.h}[0], [x5]
ADD x5, x5, x10
ST1 {v15.h}[1], [x5]
ADD x5, x5, x10
ST1 {v23.h}[1], [x5]
ADD x5, x5, x10
ST1 {v15.h}[2], [x5]
ADD x5, x5, x10
ST1 {v23.h}[2], [x5]
ADD x5, x5, x10
ST1 {v15.h}[3], [x5]
ADD x5, x5, x10
// VPOP {d8 - d15}
// LDMFD sp!, {x4-x12}
//ldp x19, x20,[sp],#16
// Restore the registers spilled by push_v_regs at entry and return.
pop_v_regs
ret
//BX x14