// blob: 7fae0bd3237c74ba15ba6506c079cebedbddc79d [file] [log] [blame]
///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
// push_v_regs
//   Spills q8-q15 and x8-x13, x16, x17, x20-x23 to the stack.
//   The 224-byte save area (4 x 32 bytes of Q pairs + 6 x 16 bytes of X
//   pairs) is reserved with a single SP adjustment and then filled with
//   offset stores.  Slot order, lowest address first:
//     x20/x21, x16/x17, x22/x23, x12/x13, x10/x11, x8/x9,
//     q14/q15, q12/q13, q10/q11, q8/q9
//   On exit SP has been lowered by 224 and stays 16-byte aligned.
.macro push_v_regs
    sub     sp, sp, #224
    stp     q8,  q9,  [sp, #192]
    stp     q10, q11, [sp, #160]
    stp     q12, q13, [sp, #128]
    stp     q14, q15, [sp, #96]
    stp     x8,  x9,  [sp, #80]
    stp     x10, x11, [sp, #64]
    stp     x12, x13, [sp, #48]
    stp     x22, x23, [sp, #32]
    stp     x16, x17, [sp, #16]
    stp     x20, x21, [sp]
.endm
// pop_v_regs
//   Reloads x8-x13, x16, x17, x20-x23 and q8-q15 from the 224-byte save
//   area at SP and then releases the area with a single SP adjustment.
//   Slot order, lowest address first:
//     x20/x21, x16/x17, x22/x23, x12/x13, x10/x11, x8/x9,
//     q14/q15, q12/q13, q10/q11, q8/q9
//   SP is raised only after every slot has been read.
.macro pop_v_regs
    ldp     x20, x21, [sp]
    ldp     x16, x17, [sp, #16]
    ldp     x22, x23, [sp, #32]
    ldp     x12, x13, [sp, #48]
    ldp     x10, x11, [sp, #64]
    ldp     x8,  x9,  [sp, #80]
    ldp     q14, q15, [sp, #96]
    ldp     q12, q13, [sp, #128]
    ldp     q10, q11, [sp, #160]
    ldp     q8,  q9,  [sp, #192]
    add     sp, sp, #224
.endm
// swp reg1, reg2
//   Exchanges the contents of two general-purpose registers, using x16 as
//   the temporary.  Both operands are moved to/from x16, so they must be
//   registers that are valid as the other operand of a 64-bit MOV.
//   Clobbers x16; the exchange does not work if either operand is x16.
.macro swp reg1, reg2
    mov     x16, \reg1
    mov     \reg1, \reg2
    mov     \reg2, x16
.endm
//------------------------------------------------------------------------------
// ixheaacd_pretwiddle_compute_armv8
//
// Multiplies 32-bit values read from two input streams by pairs of 16-bit
// coefficients read from a table, applies a common shift, and writes the
// results as interleaved pairs to an output buffer that is filled from both
// ends towards the middle.
//
// Register usage on entry, as observed from the code below (the C prototype
// is not visible in this file - TODO confirm names/types against the caller):
//   x0  input pointer, read upwards in 32-bit words
//   x1  input pointer, read downwards in 32-bit words
//   x2  output pointer, written upwards (pairs of 32-bit words)
//   x3  table base; 7500 bytes are added, then halfword pairs are read
//   x4  count; compared with 0x100 to select the table stride, and
//       (x4 - 1) >> 2 is the number of full 4-pair vector batches
//   w5  shift amount; negated on entry.  A positive negated value shifts
//       left, otherwise the results are shifted right arithmetically.
//
// For each pair the two results are (c0 = first table halfword, c1 = second,
// a = value from the x0 stream, b = value from the x1 stream):
//   first  stored word = -(a*c0 + b*c1)
//   second stored word =   a*c1 - b*c0
// (each product keeps only its upper part, see the notes in the code).
//
// Structure: one pair in scalar code (ARM_PROLOGUE), a software-pipelined
// NEON loop (NEON_PROLOGUE / CORE_LOOP / NEON_EPILOGUE) and a final partial
// batch (RESIDUE_NEON).
//
// NOTE(review): x4 is used as a full 64-bit register (LSL/CMP/SUB/ASR).  If
// the caller passes a 32-bit integer here, AAPCS64 leaves bits 63:32
// unspecified - confirm the parameter type.
//------------------------------------------------------------------------------
.text
.global ixheaacd_pretwiddle_compute_armv8
ixheaacd_pretwiddle_compute_armv8:
// Save q8-q15 and the X registers listed in push_v_regs.
push_v_regs
// x7 = x2 + 16*x4 - 4; a further 28 is subtracted below, giving the address
// of the first 32-byte block written on the descending side.
LSL x7, x4, #4
ADD x7, x2, x7
SUB x7, x7, #4
// Advance the table pointer by a fixed 7500 bytes.
LDR x22, =7500
ADD x3, x3, x22
// w5 = -w5 (two's-complement negate).
MVN w5, w5
ADD w5, w5, #1
// ---- Scalar code: first pair ------------------------------------------------
ARM_PROLOGUE:
// w21 / w22 = first / second table halfword, moved to the top 16 bits.
LDRH w21, [x3]
LDRH w22, [x3, #2]
LSL w22, w22, #16
LSL w21, w21, #16
// The loaded value is not used (w8 is overwritten before it is read); this
// load only advances x3 by 4.
LDR w8, [x3], #4
// a = *x0++ ;  x12 = (a * c0) >> 32
LDR w9, [x0], #4
SMULL X12, w9, w21
ASR X12, x12, #32
// b = *x1-- ;  x11 = (a * c1) >> 32 ;  x23 = (b * c1) >> 32
LDR w10, [x1], #-4
SMULL X11, w9, w22
ASR X11, x11, #32
SMULL X23, w10, w22
ASR X23, x23, #32
ADD w9, w12, w23
// x6 = (b * c0) >> 32
SMULL X6, w10, w21
ASR X6, x6, #32
// w9 = -(a*c0 + b*c1)
MVN w9, w9
ADD w9, w9, #1
// w11 = a*c1 - b*c0
SUB w11, w11, w6
// Apply the shift: left by w5 when w5 > 0, otherwise arithmetic right by -w5.
CMP w5, #0
BGT NEXT
MVN w8, w5
ADD w8, w8, #1
ASR w11, w11, w8
ASR w9, w9, w8
B NEXT1
NEXT:
LSL w11, w11, w5
LSL w9, w9, w5
NEXT1:
// Store the first output pair at the ascending end.
STR w9, [x2], #4
STR w11, [x2], #4
// Select the table stride in bytes (x6): 4 when x4 == 0x100, otherwise 32.
// In the 32-byte case x3 is also moved on by 28 so that, together with the
// 4 bytes consumed above, the next entry read is 32 bytes after the first.
CMP X4, #0x100
BNE NXT
MOV X6, #4
B NXT1
NXT:
MOV X6, #32
ADD X3, X3, #28
NXT1:
// x4 = number of full vector batches = (x4 - 1) >> 2.
SUB X4, X4, #1
ASR X4, X4, #2
SUB x7, x7, #28
// ---- NEON pipeline fill: compute batch 1, load inputs of batch 2 ------------
NEON_PROLOGUE:
// x8 = -32: post-index step for the descending input (x1) and output (x7).
MOV x8, #-32
// v14 = per-lane shift count for SSHL (negative counts shift right).
dup v14.4s, w5
// Point x1 at the start of the 32-byte block that ends where the scalar
// code left off.
SUB X1, X1, #28
// Gather four coefficient pairs, x6 bytes apart: v8 = c0[0..3], v9 = c1[0..3].
LD2 {v8.h, v9.h}[0], [x3], x6
LD2 {v8.h, v9.h}[1], [x3], x6
LD2 {v8.h, v9.h}[2], [x3], x6
LD2 {v8.h, v9.h}[3], [x3], x6
// v10 / v11 = the same coefficients in reversed lane order.
rev64 v10.4h, v8.4h
rev64 v11.4h, v9.4h
// De-interleave 8 words (16 halfwords) from each stream.  Assuming
// little-endian 32-bit data (TODO confirm):
//   v0/v1 = low/high halves of words 0,2,4,6 of the x0 block
//   v2/v3 = low/high halves of words 1,3,5,7 of the x0 block
//   v4/v5, v6/v7 = the same split of the x1 block
LD4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
LD4 {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8
rev64 v0.4h, v0.4h
rev64 v1.4h, v1.4h
rev64 v4.4h, v4.4h
rev64 v5.4h, v5.4h
// 32x16 products built from 16-bit pieces:
//   (unsigned low half * coeff) >> 16, then += signed high half * coeff.
// Group A uses v2/v3 (x0) and v4/v5 (x1) with v8/v9.
uMULL v30.4s, v2.4h, v9.4h
uMULL v28.4s, v4.4h, v9.4h
uMULL v26.4s, v2.4h, v8.4h
uMULL v24.4s, v4.4h, v8.4h
ushR v30.4s, v30.4s, #16
ushR v28.4s, v28.4s, #16
ushR v26.4s, v26.4s, #16
ushR v24.4s, v24.4s, #16
sMLAL v30.4s, v3.4h, v9.4h
sMLAL v28.4s, v5.4h, v9.4h
sMLAL v26.4s, v3.4h, v8.4h
sMLAL v24.4s, v5.4h, v8.4h
// v28 = -(a*c0 + b*c1) ; v30 = a*c1 - b*c0
ADD v28.4s, v26.4s , v28.4s
NEG v28.4s, v28.4s
SUB v30.4s, v30.4s , v24.4s
// Group B uses v0/v1 (x0) and v6/v7 (x1) with the reversed coefficients
// v10/v11.  The coefficient gather and input loads for the next batch are
// interleaved with this arithmetic.
uMULL v22.4s, v0.4h, v11.4h
uMULL v20.4s, v6.4h, v11.4h
uMULL v18.4s, v0.4h, v10.4h
uMULL v16.4s, v6.4h, v10.4h
ushR v22.4s, v22.4s, #16
ushR v20.4s, v20.4s, #16
ushR v18.4s, v18.4s, #16
ushR v16.4s, v16.4s, #16
sMLAL v22.4s, v1.4h, v11.4h
LD2 {v8.h, v9.h}[0], [x3], x6
sMLAL v20.4s, v7.4h, v11.4h
LD2 {v8.h, v9.h}[1], [x3], x6
sMLAL v18.4s, v1.4h, v10.4h
LD2 {v8.h, v9.h}[2], [x3], x6
sMLAL v16.4s, v7.4h, v10.4h
LD2 {v8.h, v9.h}[3], [x3], x6
// v20 = -(v20 + v18) ; v22 = v16 - v22
ADD v20.4s, v20.4s , v18.4s
NEG v20.4s, v20.4s
rev64 v10.4h, v8.4h
rev64 v11.4h, v9.4h
SUB v22.4s, v16.4s , v22.4s
LD4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
sshL v20.4s, v20.4s, v14.4s
LD4 {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8
rev64 v0.4h, v0.4h
rev64 v1.4h, v1.4h
sshL v22.4s, v22.4s, v14.4s
rev64 v4.4h, v4.4h
rev64 v5.4h, v5.4h
// Shifted group-A results: v16 = first word of each pair, v18 = second.
sshL v18.4s, v30.4s, v14.4s
sshL v16.4s, v28.4s, v14.4s
// Two batches are handled outside the loop (this fill and the epilogue).
// The loop below is a do-while that exits only when x4 reaches exactly
// zero, so x4 must be at least 1 after this subtraction.
SUB X4, X4, #2
// ---- Steady state: store batch n-1, compute batch n, load batch n+1 ---------
CORE_LOOP:
uMULL v30.4s, v2.4h, v9.4h
// ST2 needs consecutive registers: copy v18 -> v17 and store {v16,v17} as
// interleaved pairs at the ascending output pointer.
MOV v17.16B, v18.16B
ST2 { v16.4s, v17.4s}, [x2]
ADD x2, x2, #32
uMULL v28.4s, v4.4h, v9.4h
uMULL v26.4s, v2.4h, v8.4h
// Likewise v22 -> v21, then store {v20,v21} at the descending output
// pointer and step it back by 32 bytes.
MOV v21.16B, v22.16B
ST2 { v20.4s, v21.4s}, [x7], x8
uMULL v24.4s, v4.4h, v8.4h
ushR v30.4s, v30.4s, #16
ushR v28.4s, v28.4s, #16
ushR v26.4s, v26.4s, #16
ushR v24.4s, v24.4s, #16
sMLAL v30.4s, v3.4h, v9.4h
sMLAL v28.4s, v5.4h, v9.4h
sMLAL v26.4s, v3.4h, v8.4h
sMLAL v24.4s, v5.4h, v8.4h
ADD v28.4s, v26.4s , v28.4s
NEG v28.4s, v28.4s
SUB v30.4s, v30.4s , v24.4s
// Group B; v8/v9 are free from here on (group B reads v10/v11), so the
// gather of the next batch's coefficients is interleaved.
uMULL v22.4s, v0.4h, v11.4h
LD2 {v8.h, v9.h}[0], [x3], x6
uMULL v20.4s, v6.4h, v11.4h
uMULL v18.4s, v0.4h, v10.4h
LD2 {v8.h, v9.h}[1], [x3], x6
uMULL v16.4s, v6.4h, v10.4h
ushR v22.4s, v22.4s, #16
LD2 {v8.h, v9.h}[2], [x3], x6
ushR v20.4s, v20.4s, #16
ushR v18.4s, v18.4s, #16
LD2 {v8.h, v9.h}[3], [x3], x6
ushR v16.4s, v16.4s, #16
sMLAL v22.4s, v1.4h, v11.4h
sMLAL v20.4s, v7.4h, v11.4h
sMLAL v18.4s, v1.4h, v10.4h
sMLAL v16.4s, v7.4h, v10.4h
// All reads of v0-v7 for this batch are done; load the next input blocks.
LD4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
ADD v20.4s, v20.4s , v18.4s
NEG v20.4s, v20.4s
rev64 v10.4h, v8.4h
rev64 v11.4h, v9.4h
SUB v22.4s, v16.4s , v22.4s
LD4 {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8
sshL v20.4s, v20.4s, v14.4s
sshL v22.4s, v22.4s, v14.4s
rev64 v0.4h, v0.4h
rev64 v1.4h, v1.4h
sshL v18.4s, v30.4s, v14.4s
rev64 v4.4h, v4.4h
rev64 v5.4h, v5.4h
sshL v16.4s, v28.4s, v14.4s
SUBS x4, x4, #1
BNE CORE_LOOP
// ---- Pipeline drain: store the pending batch, compute and store the last ----
// ---- full batch (its inputs were loaded by the final loop iteration) --------
NEON_EPILOGUE:
uMULL v30.4s, v2.4h, v9.4h
MOV v17.16B, v18.16B
ST2 { v16.4s, v17.4s}, [x2]
ADD x2, x2, #32
uMULL v28.4s, v4.4h, v9.4h
uMULL v26.4s, v2.4h, v8.4h
MOV v21.16B, v22.16B
ST2 { v20.4s, v21.4s}, [x7], x8
uMULL v24.4s, v4.4h, v8.4h
ushR v30.4s, v30.4s, #16
ushR v28.4s, v28.4s, #16
ushR v26.4s, v26.4s, #16
ushR v24.4s, v24.4s, #16
sMLAL v30.4s, v3.4h, v9.4h
sMLAL v28.4s, v5.4h, v9.4h
sMLAL v26.4s, v3.4h, v8.4h
sMLAL v24.4s, v5.4h, v8.4h
ADD v28.4s, v26.4s , v28.4s
NEG v28.4s, v28.4s
SUB v30.4s, v30.4s , v24.4s
uMULL v22.4s, v0.4h, v11.4h
uMULL v20.4s, v6.4h, v11.4h
uMULL v18.4s, v0.4h, v10.4h
uMULL v16.4s, v6.4h, v10.4h
ushR v22.4s, v22.4s, #16
ushR v20.4s, v20.4s, #16
ushR v18.4s, v18.4s, #16
ushR v16.4s, v16.4s, #16
sMLAL v22.4s, v1.4h, v11.4h
sMLAL v20.4s, v7.4h, v11.4h
sMLAL v18.4s, v1.4h, v10.4h
sMLAL v16.4s, v7.4h, v10.4h
ADD v20.4s, v20.4s , v18.4s
NEG v20.4s, v20.4s
SUB v22.4s, v16.4s , v22.4s
sshL v20.4s, v20.4s, v14.4s
sshL v22.4s, v22.4s, v14.4s
sshL v18.4s, v30.4s, v14.4s
sshL v16.4s, v28.4s, v14.4s
MOV v17.16B, v18.16B
ST2 { v16.4s, v17.4s}, [x2]
ADD x2, x2, #32
MOV v21.16B, v22.16B
ST2 { v20.4s, v21.4s}, [x7], x8
// ---- Final partial batch ----------------------------------------------------
// Only 7 words are read from each input stream here, so the v0-v7 layout
// that LD4 produces in the main loop is assembled by hand with LD2/LD1 and
// UZP1/UZP2; the lanes that receive no data are left at zero.
RESIDUE_NEON:
// x10 is not read again in this routine.
MOV x10, #-16
movi v3.2s, #0x00000000
movi v4.2s, #0x00000000
// x0 stream: words 0-3 via LD2 (v0 = words 0,2 ; v2 = words 1,3), then
// words 4, 5, 6 into v1.s[0], v3.s[0], v1.s[1].
LD2 {v21.2s, v22.2s}, [x0], #16
MOV v0.8B, v21.8B
MOV v2.8B, v22.8B
LD1 {v1.s}[0], [x0], #4;
LD1 {v3.s}[0], [x0], #4;
LD1 {v1.s}[1], [x0]
// Split into halfword planes: v0/v1 = low/high halves of words 0,2,4,6 and
// v2/v3 = low/high halves of words 1,3,5 (fourth lane zero).
MOV v21.8B, v0.8B
UZP1 v0.4h, v21.4h, v1.4h
UZP2 v1.4h, v21.4h, v1.4h
MOV v21.8B, v2.8B
UZP1 v2.4h, v21.4h, v3.4h
UZP2 v3.4h, v21.4h, v3.4h
// x1 stream: skip one word, then read 7 words upwards.  The first three go
// to v6.s[0], v4.s[1], v6.s[1]; the remaining four are de-interleaved by
// LD2 into v5 and v7.  v4.s[0] stays zero.
ADD x1, x1, #4
LD1 {v6.s}[0], [x1], #4
LD1 {v4.s}[1], [x1], #4
LD1 {v6.s}[1], [x1], #4
LD2 {v21.2s, v22.2s}, [x1], #16
MOV v5.8B, v21.8B
MOV v7.8B, v22.8B
MOV v21.8B, v4.8B
UZP1 v4.4h, v21.4h, v5.4h
UZP2 v5.4h, v21.4h, v5.4h
MOV v21.8B, v6.8B
UZP1 v6.4h, v21.4h, v7.4h
UZP2 v7.4h, v21.4h, v7.4h
rev64 v0.4h, v0.4h
rev64 v1.4h, v1.4h
rev64 v4.4h, v4.4h
rev64 v5.4h, v5.4h
// Last four coefficient pairs.
LD2 {v8.h, v9.h}[0], [x3], x6
LD2 {v8.h, v9.h}[1], [x3], x6
LD2 {v8.h, v9.h}[2], [x3], x6
LD2 {v8.h, v9.h}[3], [x3], x6
rev64 v10.4h, v8.4h
rev64 v11.4h, v9.4h
// Same arithmetic as one main-loop batch, without interleaved loads.
uMULL v30.4s, v2.4h, v9.4h
uMULL v28.4s, v4.4h, v9.4h
uMULL v26.4s, v2.4h, v8.4h
uMULL v24.4s, v4.4h, v8.4h
ushR v30.4s, v30.4s, #16
ushR v28.4s, v28.4s, #16
ushR v26.4s, v26.4s, #16
ushR v24.4s, v24.4s, #16
sMLAL v30.4s, v3.4h, v9.4h
sMLAL v28.4s, v5.4h, v9.4h
sMLAL v26.4s, v3.4h, v8.4h
sMLAL v24.4s, v5.4h, v8.4h
ADD v28.4s, v26.4s , v28.4s
NEG v28.4s, v28.4s
SUB v30.4s, v30.4s , v24.4s
uMULL v22.4s, v0.4h, v11.4h
uMULL v20.4s, v6.4h, v11.4h
uMULL v18.4s, v0.4h, v10.4h
uMULL v16.4s, v6.4h, v10.4h
ushR v22.4s, v22.4s, #16
ushR v20.4s, v20.4s, #16
ushR v18.4s, v18.4s, #16
ushR v16.4s, v16.4s, #16
sMLAL v22.4s, v1.4h, v11.4h
sMLAL v20.4s, v7.4h, v11.4h
sMLAL v18.4s, v1.4h, v10.4h
sMLAL v16.4s, v7.4h, v10.4h
ADD v20.4s, v20.4s , v18.4s
NEG v20.4s, v20.4s
SUB v22.4s, v16.4s , v22.4s
sshL v20.4s, v20.4s, v14.4s
sshL v22.4s, v22.4s, v14.4s
sshL v18.4s, v30.4s, v14.4s
sshL v16.4s, v28.4s, v14.4s
// Descending side: a full block of four pairs (8 words) at x7.
MOV v21.16B, v22.16B
ST2 { v20.4s, v21.4s}, [x7]
// Ascending side: only three pairs are written - lanes 0-1 as one 16-byte
// ST2, then lane 2 as a single pair.
mov v17.16B, v18.16B
ST2 {v16.2s, v17.2s}, [x2]
ADD x2, x2, #16
ST2 {v16.s, v17.s}[2], [x2]
ADD x2, x2, #8
END1:
// Restore the registers saved by push_v_regs and return.
pop_v_regs
ret