// blob: c28b9da8c92984afb4c6edbcd18e0495bf746d92 [file] [log] [blame]
///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
// push_v_regs
//   Reserves a 160-byte frame and spills q8-q15 and x21-x24 into it.
//   Slot layout relative to the new sp (must stay in step with pop_v_regs):
//     [sp, #0]   x23, x24
//     [sp, #16]  x21, x22
//     [sp, #32]  q14, q15
//     [sp, #64]  q12, q13
//     [sp, #96]  q10, q11
//     [sp, #128] q8,  q9
//   160 is a multiple of 16, so sp keeps its 16-byte alignment.
.macro push_v_regs
    sub     sp, sp, #160
    stp     q8,  q9,  [sp, #128]
    stp     q10, q11, [sp, #96]
    stp     q12, q13, [sp, #64]
    stp     q14, q15, [sp, #32]
    stp     x21, x22, [sp, #16]
    stp     x23, x24, [sp]
.endm
// pop_v_regs
//   Reloads x21-x24 and q8-q15 from the 160-byte frame written by
//   push_v_regs, then releases the frame. Slot offsets relative to sp:
//     #0 x23/x24, #16 x21/x22, #32 q14/q15, #64 q12/q13,
//     #96 q10/q11, #128 q8/q9.
.macro pop_v_regs
    ldp     x23, x24, [sp]
    ldp     x21, x22, [sp, #16]
    ldp     q14, q15, [sp, #32]
    ldp     q12, q13, [sp, #64]
    ldp     q10, q11, [sp, #96]
    ldp     q8,  q9,  [sp, #128]
    add     sp, sp, #160
.endm
// swp reg1, reg2
//   Exchanges the contents of the two operands, using x16 as the temporary.
//   Clobbers: x16.
//   In this file it is invoked with 64-bit vector lanes (for example
//   v14.D[0] and v14.D[1]) to swap the two halves of a 128-bit register.
//   NOTE(review): the macro shares its name with the ARMv8.1 SWP atomic
//   instruction; confirm the assembler in use gives the macro precedence.
.macro swp reg1, reg2
    mov     x16, \reg1
    mov     \reg1, \reg2
    mov     \reg2, x16
.endm
.text
.global ixheaacd_post_twiddle_armv8
// -----------------------------------------------------------------------------
// ixheaacd_post_twiddle_armv8
//
// Registers on entry, as they are read by the code below:
//   x0 - output buffer of 32-bit words; written from both ends toward the
//        middle (x0 ascends, x7 descends from the last word)
//   x1 - input buffer of 32-bit words; read from both ends (x1 ascends,
//        x5 descends from the last 32 bytes)
//   x2 - table base; 16-bit coefficient pairs are read starting 7500 bytes
//        past this address, one pair every x6 bytes
//   w3 - element count n; selects the constants and sets the loop trip count
// NOTE(review): presumably called from C as (out, in, tables, npoints) -
// confirm against the prototype.
//
// Internal register roles:
//   w4, v10.4h - scaling constant: 50 when w3 >= 0x400, otherwise 0x192
//   w5         - 16-bit two's-complement negation of w4 (scalar step only)
//   x6         - table stride in bytes: 4 when w3 >= 0x400, otherwise 32
//   x5         - descending input pointer (set up after w5 has been consumed)
//   x7         - descending output pointer
//   x8         - constant -32, post-index step for x5 and x7
//   x16        - clobbered by the swp macro
//   x21-x23    - scalar scratch; saved and restored by push/pop_v_regs
//
// Structure: one scalar pair, then a software-pipelined NEON section
// (NEON_PROLOGUE / CORE_LOOP / NEON_EPILOGUE) that produces 8 words at each
// end of the output per group, then a tail that produces 7 words at each end.
//
// NOTE(review): CORE_LOOP is a do-while on x3 = ((w3 - 1) >> 4) - 2 and exits
// only when SUBS reaches zero, so that value must be >= 1 on entry.
// -----------------------------------------------------------------------------
ixheaacd_post_twiddle_armv8:
push_v_regs
// ARM_PROLOGUE is not a branch target in this file; it only marks the
// scalar set-up section.
ARM_PROLOGUE:
// Flags from this CMP are consumed by the BLT three instructions later;
// the MOV and ADD in between do not modify flags.
CMP w3, #0x400
MOV x21, #7500
ADD x2, x2, x21
BLT NEXT
// w3 >= 0x400: constant 50, table stride 4 bytes.
MOV w4, #50
MOV w5, #-50
MOV x6, #4
dup v10.4h, w4
B NEXT1
NEXT:
// w3 < 0x400: constant 0x192, table stride 32 bytes.
// 0xfe6e is -0x192 as a 16-bit two's-complement value.
MOV w4, #0x192
MOV w5, #0xfe6e
MOV x6, #32
dup v10.4h, w4
NEXT1:
// ---- Scalar step: first input pair -> out[0] and out[n-1] ----
// Split the 32-bit table word into two 16-bit coefficients, each placed in
// the top half of a 32-bit register: w22 = low half << 16, w21 = high half.
LDR w9, [x2]
LSL W22, W9, #16
AND W21, W9, #0xFFFF0000
// w7, w8 = first two input words; x1 advances by 8 bytes in total.
LDR w7, [x1], #4
LDR w8, [x1], #4
ADD x2, x2, x6
// Four 32x32 signed products, each reduced to its upper 32 bits.
SMULL X11, w8, w21
ASR X11, x11, #32
SMULL X10, w8, w22
ASR X10, x10, #32
SMULL X12, w7, w21
ASR X12, x12, #32
SMULL X23, w7, w22
ASR X23, x23, #32
// w8 = -(w8*w21 + w7*w22) ; w10 = w8*w22 - w7*w21 (upper-32 products).
ADD w8, w11, w23
SUB w10, w10, w12
// MVN followed by ADD #1 is a two's-complement negate of w8.
MVN w8, w8
ADD w8, w8, #1
// Scale-and-add with the constants: w21 = w5 << 16, w22 = w4 << 16.
// This is the last use of w5 as a constant; x5 becomes a pointer below.
LSL w21, w5, #16
LSL w22, w4, #16
SMULL X23, w10, w21
ASR X23, x23, #32
ADD w9, w8, w23
SMULL X23, w8, w22
ASR X23, x23, #32
ADD w11, w10, w23
// x7 = x0 + 4*n - 4, the address of the last output word.
// NOTE(review): this uses all 64 bits of x3 although the compare above used
// only w3 - it assumes the upper 32 bits of x3 are zero; confirm with caller.
LSL x7, x3, #2
ADD x7, x0, x7
SUB x7, x7, #4
// Store the last output word (x7 then steps down) and the first (x0 steps up).
STR w11, [x7], #-4
STR w9, [x0], #4
// x5 = x1 + 4*n - 40. x1 has already advanced 8 bytes, so x5 addresses the
// last 32 bytes of the input buffer.
LSL x5, x3, #2
ADD x5, x1, x5
SUB x5, x5, #40
// Group count for the NEON section: w3 = (n - 1) >> 4.
SUB w3, w3, #1
ASR w3, w3, #4
// Move x7 down so a 32-byte ST2 at x7 ends just below the word stored above.
SUB x7, x7, #28
// x8 = -32: post-index step that walks x5 and x7 downward.
MOV x8, #-32
// NEON_PROLOGUE is not a branch target; it computes the first group of results
// without storing them, priming the pipeline for CORE_LOOP.
NEON_PROLOGUE:
// LD4 on .4h de-interleaves 16 halfwords: for four pairs of 32-bit words,
// v0/v1 (v4/v5) get the low/high halves of the first word of each pair and
// v2/v3 (v6/v7) the low/high halves of the second word.
// v0-v3 come from the descending end (x5), v4-v7 from the ascending end (x1).
LD4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8
LD4 {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32
// Gather four coefficient pairs, one per lane, x6 bytes apart:
// first halfword of each pair -> v8, second -> v9.
LD2 {v8.h, v9.h}[0], [x2], x6
LD2 {v8.h, v9.h}[1], [x2], x6
LD2 {v8.h, v9.h}[2], [x2], x6
LD2 {v8.h, v9.h}[3], [x2], x6
// v12/v13 = v8/v9 with the four halfword lanes reversed; these are applied
// to the data loaded from the descending end.
rev64 v12.4h, v8.4h
rev64 v13.4h, v9.4h
// 32-bit x 16-bit products built from halves: unsigned (low half * coeff)
// shifted right 16, then signed (high half * coeff) accumulated on top.
uMULL v30.4s, v2.4h, v13.4h
uMULL v28.4s, v0.4h, v13.4h
uMULL v26.4s, v2.4h, v12.4h
uMULL v24.4s, v0.4h, v12.4h
ushR v30.4s, v30.4s, #16
ushR v28.4s, v28.4s, #16
ushR v26.4s, v26.4s, #16
ushR v24.4s, v24.4s, #16
sMLAL v30.4s, v3.4h, v13.4h
sMLAL v28.4s, v1.4h, v13.4h
sMLAL v26.4s, v3.4h, v12.4h
sMLAL v24.4s, v1.4h, v12.4h
// Same product pattern for the ascending-end data with v8/v9, interleaved
// with the combine step for the descending-end data.
uMULL v22.4s, v6.4h, v9.4h
uMULL v20.4s, v4.4h, v9.4h
// v28 = -(v28 + v26) ; v30 = v30 - v24.
ADD v28.4s, v28.4s , v26.4s
SUB v30.4s, v30.4s , v24.4s
NEG v28.4s, v28.4s
uMULL v18.4s, v6.4h, v8.4h
uMULL v16.4s, v4.4h, v8.4h
// Split v30 and v28 back into low-half / high-half vectors (UZP1 / UZP2) so
// they can be multiplied by the 16-bit constant in v10. v31 is a temporary.
mov v31.8b, v30.8b
mov v27.D[0], v30.D[1]
ushR v22.4s, v22.4s, #16
mov v24.8b, v28.8b
mov v25.D[0], v28.D[1]
ushR v20.4s, v20.4s, #16
UZP1 v26.4h, v31.4h, v27.4h
UZP2 v27.4h, v31.4h, v27.4h
ushR v18.4s, v18.4s, #16
mov v31.8B , v24.8B
UZP1 v24.4h, v31.4h, v25.4h
UZP2 v25.4h, v31.4h, v25.4h
ushR v16.4s, v16.4s, #16
sMLAL v22.4s, v7.4h, v9.4h
sMLAL v20.4s, v5.4h, v9.4h
sMLAL v18.4s, v7.4h, v8.4h
sMLAL v16.4s, v5.4h, v8.4h
// v8/v9 are no longer needed for this group once the sMLALs above have
// issued: start gathering the next group's coefficients, interleaved with
// the remaining arithmetic.
LD2 {v8.h, v9.h}[0], [x2], x6
uMULL v0.4s, v26.4h, v10.4h
LD2 {v8.h, v9.h}[1], [x2], x6
uMULL v2.4s, v24.4h, v10.4h
LD2 {v8.h, v9.h}[2], [x2], x6
// v22 = -(v22 + v16) ; v20 = v18 - v20.
ADD v22.4s, v22.4s , v16.4s
LD2 {v8.h, v9.h}[3], [x2], x6
SUB v20.4s, v18.4s , v20.4s
rev64 v12.4h, v8.4h
rev64 v13.4h, v9.4h
NEG v22.4s, v22.4s
// Split v22 and v20 into low/high halfword vectors, as done for v30/v28.
mov v18.8b, v22.8b
mov v19.D[0], v22.D[1]
ushR v0.4s, v0.4s, #16
mov v16.16b, v20.16b
mov v17.D[0], v20.D[1]
ushR v2.4s, v2.4s, #16
MOV v31.8b, v18.8b
UZP1 v18.4h, v31.4h, v19.4h
UZP2 v19.4h, v31.4h, v19.4h
sMLAL v0.4s, v27.4h, v10.4h
MOV v31.8b, v16.8b
UZP1 v16.4h, v31.4h, v17.4h
UZP2 v17.4h, v31.4h, v17.4h
sMLAL v2.4s, v25.4h, v10.4h
uMULL v4.4s, v18.4h, v10.4h
uMULL v6.4s, v16.4h, v10.4h
// v14 = v30 + (v28 scaled by v10) ; v26 = v28 - (v30 scaled by v10).
NEG v0.4s, v0.4s
ADD v14.4s, v30.4s , v2.4s
ADD v26.4s, v28.4s , v0.4s
// rev64 on .4s swaps words within each 64-bit half and swp exchanges the two
// halves: together they reverse the order of all four 32-bit lanes of v14.
rev64 v14.4s, v14.4s
ushR v4.4s, v4.4s, #16
swp v14.D[0], v14.D[1]
ushR v6.4s, v6.4s, #16
sMLAL v4.4s, v19.4h, v10.4h
// Pre-load the next group's descending-end data.
LD4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8
sMLAL v6.4s, v17.4h, v10.4h
// Loop counter: x3 = ((n - 1) >> 4) - 2.
SUB x3, x3, #2
// v24 = v20 + (v22 scaled by v10), lanes reversed ;
// v16 = v22 - (v20 scaled by v10).
ADD v24.4s, v20.4s , v4.4s
rev64 v24.4s, v24.4s
NEG v16.4s, v6.4s
// Pre-load the next group's ascending-end data.
LD4 {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32
swp v24.D[0], v24.D[1]
ADD v16.4s, v22.4s , v16.4s
// ---- Steady state: store the previous group's results (v24/v26 and
// v14/v16) while computing the current group with the same arithmetic as
// NEON_PROLOGUE. ----
CORE_LOOP:
uMULL v30.4s, v2.4h, v13.4h
// ST2 takes consecutively numbered registers, hence the copy v24 -> v25.
// Interleaved store of 8 words at the descending output pointer.
MOV v25.16B, v24.16B
ST2 { v25.4s, v26.4s}, [x7], x8
uMULL v28.4s, v0.4h, v13.4h
uMULL v26.4s, v2.4h, v12.4h
// Likewise v14 -> v15; interleaved store of 8 words at the ascending pointer.
MOV v15.16B, v14.16B
ST2 { v15.4s, v16.4s}, [x0], #32
uMULL v24.4s, v0.4h, v12.4h
ushR v30.4s, v30.4s, #16
ushR v28.4s, v28.4s, #16
ushR v26.4s, v26.4s, #16
ushR v24.4s, v24.4s, #16
sMLAL v30.4s, v3.4h, v13.4h
sMLAL v28.4s, v1.4h, v13.4h
sMLAL v26.4s, v3.4h, v12.4h
sMLAL v24.4s, v1.4h, v12.4h
uMULL v22.4s, v6.4h, v9.4h
uMULL v20.4s, v4.4h, v9.4h
// v28 = -(v28 + v26) ; v30 = v30 - v24.
ADD v28.4s, v28.4s , v26.4s
SUB v30.4s, v30.4s , v24.4s
NEG v28.4s, v28.4s
uMULL v18.4s, v6.4h, v8.4h
uMULL v16.4s, v4.4h, v8.4h
// Split v30 / v28 into low/high halfword vectors (v26/v27 and v24/v25).
mov v26.8b, v30.8b
mov v27.D[0], v30.D[1]
ushR v22.4s, v22.4s, #16
mov v24.8b, v28.8b
mov v25.D[0], v28.D[1]
ushR v20.4s, v20.4s, #16
MOV v31.8b, v26.8b
UZP1 v26.4h, v31.4h, v27.4h
UZP2 v27.4h, v31.4h, v27.4h
ushR v18.4s, v18.4s, #16
MOV v31.8b, v24.8b
UZP1 v24.4h, v31.4h, v25.4h
UZP2 v25.4h, v31.4h, v25.4h
ushR v16.4s, v16.4s, #16
sMLAL v22.4s, v7.4h, v9.4h
sMLAL v20.4s, v5.4h, v9.4h
sMLAL v18.4s, v7.4h, v8.4h
sMLAL v16.4s, v5.4h, v8.4h
// Gather the next group's coefficient pairs while finishing this group.
LD2 {v8.h, v9.h}[0], [x2], x6
uMULL v0.4s, v26.4h, v10.4h
LD2 {v8.h, v9.h}[1], [x2], x6
uMULL v2.4s, v24.4h, v10.4h
LD2 {v8.h, v9.h}[2], [x2], x6
// v22 = -(v22 + v16) ; v20 = v18 - v20.
ADD v22.4s, v22.4s , v16.4s
LD2 {v8.h, v9.h}[3], [x2], x6
SUB v20.4s, v18.4s , v20.4s
rev64 v12.4h, v8.4h
rev64 v13.4h, v9.4h
NEG v22.4s, v22.4s
// Split v22 / v20 into low/high halfword vectors (v18/v19 and v16/v17).
mov v18.8b, v22.8b
mov v19.D[0], v22.D[1]
ushR v0.4s, v0.4s, #16
mov v16.8b, v20.8b
mov v17.D[0], v20.D[1]
ushR v2.4s, v2.4s, #16
MOV v31.8b, v18.8b
UZP1 v18.4h, v31.4h, v19.4h
UZP2 v19.4h, v31.4h, v19.4h
sMLAL v0.4s, v27.4h, v10.4h
MOV v31.8b, v16.8b
UZP1 v16.4h, v31.4h, v17.4h
UZP2 v17.4h, v31.4h, v17.4h
sMLAL v2.4s, v25.4h, v10.4h
uMULL v4.4s, v18.4h, v10.4h
uMULL v6.4s, v16.4h, v10.4h
// v14 = v30 + (v28 scaled by v10), then lane-reversed ;
// v26 = v28 - (v30 scaled by v10).
NEG v0.4s, v0.4s
ADD v14.4s, v30.4s , v2.4s
ADD v26.4s, v28.4s , v0.4s
rev64 v14.4s, v14.4s
ushR v4.4s, v4.4s, #16
swp v14.D[0], v14.D[1]
ushR v6.4s, v6.4s, #16
sMLAL v4.4s, v19.4h, v10.4h
// Pre-load the next group's descending-end data.
LD4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8
sMLAL v6.4s, v17.4h, v10.4h
// v24 = v20 + (v22 scaled by v10), then lane-reversed ;
// v16 = v22 - (v20 scaled by v10).
ADD v24.4s, v20.4s , v4.4s
rev64 v24.4s, v24.4s
NEG v16.4s, v6.4s
// Pre-load the next group's ascending-end data.
LD4 {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32
swp v24.D[0], v24.D[1]
ADD v16.4s, v22.4s , v16.4s
// do-while: repeat until the counter reaches exactly zero.
SUBS x3, x3, #1
BNE CORE_LOOP
// NEON_EPILOGUE is not a branch target. It stores the results of the last
// loop iteration, computes one more full group from the data and coefficients
// already loaded, and stores that group as well (no look-ahead loads here).
NEON_EPILOGUE:
uMULL v30.4s, v2.4h, v13.4h
MOV v25.16B, v24.16B
ST2 { v25.4s, v26.4s}, [x7], x8
uMULL v28.4s, v0.4h, v13.4h
uMULL v26.4s, v2.4h, v12.4h
MOV v15.16B, v14.16B
ST2 { v15.4s, v16.4s}, [x0], #32
uMULL v24.4s, v0.4h, v12.4h
ushR v30.4s, v30.4s, #16
ushR v28.4s, v28.4s, #16
ushR v26.4s, v26.4s, #16
ushR v24.4s, v24.4s, #16
sMLAL v30.4s, v3.4h, v13.4h
sMLAL v28.4s, v1.4h, v13.4h
sMLAL v26.4s, v3.4h, v12.4h
sMLAL v24.4s, v1.4h, v12.4h
uMULL v22.4s, v6.4h, v9.4h
uMULL v20.4s, v4.4h, v9.4h
// v28 = -(v28 + v26) ; v30 = v30 - v24.
ADD v28.4s, v28.4s , v26.4s
SUB v30.4s, v30.4s , v24.4s
NEG v28.4s, v28.4s
uMULL v18.4s, v6.4h, v8.4h
uMULL v16.4s, v4.4h, v8.4h
// Split v30 / v28 into low/high halfword vectors.
mov v26.8b, v30.8b
mov v27.D[0], v30.D[1]
ushR v22.4s, v22.4s, #16
mov v24.16b, v28.16b
mov v25.D[0], v28.D[1]
ushR v20.4s, v20.4s, #16
mov v31.8b, v26.8b
UZP1 v26.4h, v31.4h, v27.4h
UZP2 v27.4h, v31.4h, v27.4h
ushR v18.4s, v18.4s, #16
mov v31.8b, v24.8b
UZP1 v24.4h, v31.4h, v25.4h
UZP2 v25.4h, v31.4h, v25.4h
ushR v16.4s, v16.4s, #16
sMLAL v22.4s, v7.4h, v9.4h
sMLAL v20.4s, v5.4h, v9.4h
sMLAL v18.4s, v7.4h, v8.4h
sMLAL v16.4s, v5.4h, v8.4h
uMULL v0.4s, v26.4h, v10.4h
uMULL v2.4s, v24.4h, v10.4h
// v22 = -(v22 + v16) ; v20 = v18 - v20.
ADD v22.4s, v22.4s , v16.4s
SUB v20.4s, v18.4s , v20.4s
NEG v22.4s, v22.4s
// Split v22 / v20 into low/high halfword vectors; here the full register is
// copied to v31 first and the upper half is taken from v31.
mov v18.16b, v22.16b
ushR v0.4s, v0.4s, #16
mov v16.16b, v20.16b
ushR v2.4s, v2.4s, #16
mov v31.16b, v18.16b
mov v19.d[0], v31.d[1]
UZP1 v18.4h, v31.4h, v19.4h
UZP2 v19.4h, v31.4h, v19.4h
sMLAL v0.4s, v27.4h, v10.4h
mov v31.16b, v16.16b
mov v17.d[0], v31.d[1]
UZP1 v16.4h, v31.4h, v17.4h
UZP2 v17.4h, v31.4h, v17.4h
sMLAL v2.4s, v25.4h, v10.4h
uMULL v4.4s, v18.4h, v10.4h
uMULL v6.4s, v16.4h, v10.4h
NEG v0.4s, v0.4s
ADD v14.4s, v30.4s , v2.4s
ADD v26.4s, v28.4s , v0.4s
rev64 v14.4s, v14.4s
ushR v4.4s, v4.4s, #16
swp v14.D[0], v14.D[1]
ushR v6.4s, v6.4s, #16
sMLAL v4.4s, v19.4h, v10.4h
sMLAL v6.4s, v17.4h, v10.4h
ADD v24.4s, v20.4s , v4.4s
rev64 v24.4s, v24.4s
NEG v16.4s, v6.4s
swp v24.D[0], v24.D[1]
ADD v16.4s, v22.4s , v16.4s
// Store this group immediately: 8 words ascending, 8 words descending.
MOV v25.16B, v24.16B
MOV v15.16B, v14.16B
ST2 { v15.4s, v16.4s}, [x0], #32
ST2 { v25.4s, v26.4s}, [x7], x8
// ---- Tail: final partial group. The descending end still loads a full
// 32 bytes, but the ascending end loads only 6 words (three pairs), and only
// 7 words are stored at each end. ----
LD4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8
// Clear v6/v7 so that lane 1, which the single-lane load below does not
// write, reads as zero.
movi v6.2s, #0x00000000
movi v7.2s, #0x00000000
// Four words de-interleaved into v4 (words 0, 2) and v5 (words 1, 3), then
// two more words into lane 0 of v6 and v7.
LD2 {v4.2s, v5.2s}, [x1], #16
LD2 {v6.s, v7.s}[0], [x1]
LD2 {v8.h, v9.h}[0], [x2], x6
LD2 {v8.h, v9.h}[1], [x2], x6
LD2 {v8.h, v9.h}[2], [x2], x6
LD2 {v8.h, v9.h}[3], [x2], x6
// .8h form here; only the low four halfword lanes are consumed below.
rev64 v12.8h, v8.8h
rev64 v13.8h, v9.8h
// Regroup so v4/v5 hold the first word of each pair and v6/v7 the second,
// then split each into low/high halfword vectors - the same layout the LD4
// loads produce in the earlier sections, with the fourth lane zero.
swp v5.D[0], v6.D[0]
MOV v30.8B, V4.8B
UZP1 v4.4h, v30.4h, v5.4h
UZP2 v5.4h, v30.4h, v5.4h
MOV v30.8B, V6.8B
UZP1 v6.4h, v30.4h, v7.4h
UZP2 v7.4h, v30.4h, v7.4h
// Same arithmetic as the earlier groups from here to the stores.
uMULL v30.4s, v2.4h, v13.4h
uMULL v28.4s, v0.4h, v13.4h
uMULL v26.4s, v2.4h, v12.4h
uMULL v24.4s, v0.4h, v12.4h
ushR v30.4s, v30.4s, #16
ushR v28.4s, v28.4s, #16
ushR v26.4s, v26.4s, #16
ushR v24.4s, v24.4s, #16
sMLAL v30.4s, v3.4h, v13.4h
sMLAL v28.4s, v1.4h, v13.4h
sMLAL v26.4s, v3.4h, v12.4h
sMLAL v24.4s, v1.4h, v12.4h
uMULL v22.4s, v6.4h, v9.4h
uMULL v20.4s, v4.4h, v9.4h
ADD v28.4s, v28.4s , v26.4s
SUB v30.4s, v30.4s , v24.4s
NEG v28.4s, v28.4s
uMULL v18.4s, v6.4h, v8.4h
uMULL v16.4s, v4.4h, v8.4h
mov v26.8b, v30.8b
mov v27.D[0], v30.D[1]
ushR v22.4s, v22.4s, #16
mov v24.16b, v28.16b
mov v25.D[0], v28.D[1]
ushR v20.4s, v20.4s, #16
MOV v31.8B, V26.8B
UZP1 v26.4h, v31.4h, v27.4h
UZP2 v27.4h, v31.4h, v27.4h
ushr v18.4s, v18.4s, #16
MOV v31.8B, V24.8B
UZP1 v24.4h, v31.4h, v25.4h
UZP2 v25.4h, v31.4h, v25.4h
ushR v16.4s, v16.4s, #16
sMLAL v22.4s, v7.4h, v9.4h
sMLAL v20.4s, v5.4h, v9.4h
sMLAL v18.4s, v7.4h, v8.4h
sMLAL v16.4s, v5.4h, v8.4h
uMULL v0.4s, v26.4h, v10.4h
uMULL v2.4s, v24.4h, v10.4h
ADD v22.4s, v22.4s , v16.4s
SUB v20.4s, v18.4s , v20.4s
NEG v22.4s, v22.4s
mov v18.8B, v22.8B
mov v19.D[0], v22.D[1]
ushR v0.4s, v0.4s, #16
mov v16.16b, v20.16b
mov v17.D[0], v20.D[1]
ushR v2.4s, v2.4s, #16
MOV v31.8B, V18.8B
UZP1 v18.4h, v31.4h, v19.4h
UZP2 v19.4h, v31.4h, v19.4h
sMLAL v0.4s, v27.4h, v10.4h
MOV v31.8B, V16.8B
UZP1 v16.4h, v31.4h, v17.4h
UZP2 v17.4h, v31.4h, v17.4h
sMLAL v2.4s, v25.4h, v10.4h
uMULL v4.4s, v18.4h, v10.4h
uMULL v6.4s, v16.4h, v10.4h
NEG v0.4s, v0.4s
ADD v14.4s, v30.4s , v2.4s
ADD v26.4s, v28.4s , v0.4s
rev64 v14.4s, v14.4s
ushR v4.4s, v4.4s, #16
swp v14.D[0], v14.D[1]
ushR v6.4s, v6.4s, #16
sMLAL v4.4s, v19.4h, v10.4h
sMLAL v6.4s, v17.4h, v10.4h
ADD v24.4s, v20.4s , v4.4s
rev64 v24.4s, v24.4s
NEG v16.4s, v6.4s
swp v24.D[0], v24.D[1]
ADD v16.4s, v22.4s , v16.4s
// Ascending end: 7 words in total - lanes 0-1 of v15/v16 interleaved
// (4 words), lane 2 of both (2 words), then lane 3 of v15 alone (1 word).
MOV v15.16B, v14.16B
ST2 {v15.2s, v16.2s}, [x0], #16
ST2 {v15.s, v16.s}[2], [x0], #8
ST1 {v15.s}[3], [x0]
// Descending end: 7 words in total, starting one word above x7 - lane 0 of
// v26 alone, lane 1 of v25/v26, then lanes 2-3 of v25/v26 interleaved.
ADD x7, x7, #4
ST1 {v26.s}[0], [x7], #4
MOV v25.16B, v24.16B
ST2 {v25.s, v26.s}[1], [x7], #8
// Save the upper half of v26 into v27 before v26's lower half is overwritten
// with the upper half of v25; ST2 then needs the consecutive pair v26/v27.
MOV v27.D[0], V26.d[1]
mov v26.d[0], v25.d[1]
ST2 {v26.2s, v27.2s}, [x7]
pop_v_regs
ret