// blob: db365e7d26bc5e8b169d0f6219ab92c1eb88cd99 [file] [log] [blame]
///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
.macro push_v_regs
    // Save q8-q15, x8-x17 and x29/x30 in a 224-byte frame below the incoming
    // sp (4 * 32 bytes of vector pairs + 6 * 16 bytes of integer pairs).
    // The frame is reserved with a single sp adjustment and filled with
    // offset stores; the resulting layout (x29/x30 at the lowest address,
    // q8/q9 at the highest) is the one pop_v_regs unwinds.
    // 224 is a multiple of 16, so sp keeps its 16-byte alignment.
    sub     sp, sp, #224
    stp     q8,  q9,  [sp, #192]
    stp     q10, q11, [sp, #160]
    stp     q12, q13, [sp, #128]
    stp     q14, q15, [sp, #96]
    stp     x8,  x9,  [sp, #80]
    stp     x10, x11, [sp, #64]
    stp     x12, x13, [sp, #48]
    stp     x14, x15, [sp, #32]
    stp     x16, x17, [sp, #16]
    stp     x29, x30, [sp]
.endm
.macro pop_v_regs
    // Reload everything push_v_regs stored, reading the 224-byte frame with
    // offset loads (x29/x30 at the lowest address, q8/q9 at the highest) and
    // releasing the frame with a single sp adjustment once all loads are done.
    ldp     x29, x30, [sp]
    ldp     x16, x17, [sp, #16]
    ldp     x14, x15, [sp, #32]
    ldp     x12, x13, [sp, #48]
    ldp     x10, x11, [sp, #64]
    ldp     x8,  x9,  [sp, #80]
    ldp     q14, q15, [sp, #96]
    ldp     q12, q13, [sp, #128]
    ldp     q10, q11, [sp, #160]
    ldp     q8,  q9,  [sp, #192]
    add     sp, sp, #224
.endm
.text
.global ixheaacd_over_lap_add2_armv8
//------------------------------------------------------------------------------
// ixheaacd_over_lap_add2_armv8
//
// Windowed multiply/subtract of two 32-bit buffers with a 16-bit window,
// written out as strided 32-bit results in two passes of N results each.
//
// Register roles on entry, as read by the code below. The C prototype is not
// visible in this file, so names/types are inferred from usage -- confirm
// against the caller:
//   X0  "in"     buffer read as 32-bit words; words N .. 2N-1 are used
//                (ascending in pass 1, descending and negated in pass 2)
//   X1  "prev"   buffer read as 32-bit words; words 0 .. N-1 are used
//                (descending in pass 1, ascending in pass 2)
//   X2  "out"    destination; one 32-bit lane per store, pointer advanced by
//                4*X6 bytes after every store
//   X3  "win"    buffer read as 16-bit values in even/odd pairs
//                (ascending in pass 1, descending in pass 2)
//   X4  "shift"  scaling parameter: per-lane SSHL amount is (shift - 15)
//                (negative = right shift) and the value added before the
//                shift is 1 << (14 - shift)
//   X5  N        element count per pass
//   X6  stride   output stride in 32-bit elements
//
// Per-lane arithmetic (a, b are 32-bit words split into 16-bit halves; the
// low/high split via LD2/UZP assumes a little-endian target; wa, wb are
// 16-bit window values):
//   t   = lo16(a)*wa - lo16(b)*wb        unsigned 16x16 (UMULL/UMLSL)
//   t   = t >> 16                        arithmetic (SSHR)
//   t  += hi16(a)*wa - hi16(b)*wb        signed 16x16 (SMLAL/SMLSL)
//   out = SSHL(SQADD(t, round), shift - 15)
//
//   pass 1, i = 0..N-1 : a =  in[N+i],       b = prev[N-1-i],
//                        wa = win[2i],       wb = win[2i+1]
//   pass 2, j = 0..N-1 : a = -in[2N-1-j] (SQNEG), b = prev[j],
//                        wa = win[2N-1-2j],  wb = win[2N-2-2j]
//   Pass 2 output starts at out + 4 * (lo16(N) * lo16(stride)) bytes.
//
// NOTE(review): both loops are software pipelined (4 results in the prologue,
// 8 per iteration, 4 in the epilogue, body always executed at least once), so
// N appears to have to be a multiple of 8 and at least 16 -- confirm.
// NOTE(review): X4/X5/X6 are consumed as full 64-bit registers in the address
// and counter arithmetic; if the C arguments are narrower than 64 bits this
// relies on the caller having extended them -- confirm.
//
// Modifies: X1-X5, X7-X14, NZCV, V0-V7, V16, V18-V24, V29-V31.
// V8-V15 and X8-X17 are used as scratch but restored by pop_v_regs.
//------------------------------------------------------------------------------
ixheaacd_over_lap_add2_armv8:
    push_v_regs

    // ---- pass 1 set-up ------------------------------------------------------
    MOV X8, X5                              // X8 = loop counter, starts at N
    SUB X12, X5, #1
    LSL X9, X5, #2                          // X9  = 4*N bytes
    LSL X12, X12, #2                        // X12 = 4*(N-1) bytes
    ADD X10, X0, X9                         // X10 -> in[N], read ascending
    ADD X7, X1, X12                         // X7  -> prev[N-1]
    ADD X4, X4, #1
    // LD2 de-interleaves halfwords: V0 = low halves, V1 = high halves of 4 words
    LD2 {V0.4H, V1.4H}, [X10], #16
    LSL X11, X6, #2                         // X11 = output stride in bytes
    SUB X7, X7, #12                         // X7  -> prev[N-4], read descending
    SUB X4, X4, #16                         // X4  = shift - 15
    MOV X12, #-16                           // post-index step for descending loads
    MOV X13, #1
    ADD X14, X4, #1
    NEG X14, X14                            // X14 = 14 - shift
    DUP V21.4S, W4                          // V21 = per-lane SSHL amount
    LD2 {V6.4H, V7.4H}, [X7], X12
    LSL X4, X13, X14                        // X4 = 1 << (14 - shift), shift count taken mod 64
    REV64 V4.4H, V6.4H                      // reverse lane order: V4 = low halves of prev, descending
    DUP V20.4S, W4                          // V20 = value added before the final shift
    REV64 V5.4H, V7.4H                      // V5 = high halves of prev, descending
    MOV X4, X3                              // keep window base for pass 2
    MOV X9, X2                              // keep output base for pass 2
    LD2 {V2.4H, V3.4H}, [X3], #16           // V2 = even window entries, V3 = odd

    // ---- pass 1 prologue: first quad -> V24, second quad loaded -------------
    UMULL V23.4S, V0.4H, V2.4H
    UMLSL V23.4S, V4.4H, V3.4H
    LD2 {V8.4H, V9.4H}, [X10], #16
    SSHR V23.4S, V23.4S, #16
    LD2 {V10.4H, V11.4H}, [X3], #16
    SMLAL V23.4S, V1.4H, V2.4H
    SMLSL V23.4S, V5.4H, V3.4H
    LD2 {V14.4H, V15.4H}, [X7], X12
    REV64 V12.4H, V14.4H
    REV64 V13.4H, V15.4H
    SQADD V22.4S, V23.4S, V20.4S
    SSHL V22.4S, V22.4S, V21.4S
    MOV V24.16B, V22.16B                    // V24 = finished quad, stored inside the loop
    SUB X8, X8, #8

    // ---- pass 1 main loop: 8 results per iteration --------------------------
    // V24 (finished in the previous step) is stored while the quad held in
    // V8-V13 is computed into V19 and the freshly loaded quad into V23.
LOOP_1:
    LD2 {V0.4H, V1.4H}, [X10], #16
    UMULL V19.4S, V8.4H, V10.4H
    LD2 {V2.4H, V3.4H}, [X3], #16
    UMLSL V19.4S, V12.4H, V11.4H
    LD2 {V6.4H, V7.4H}, [X7], X12
    UMULL V23.4S, V0.4H, V2.4H
    REV64 V4.4H, V6.4H
    UMLSL V23.4S, V4.4H, V3.4H
    REV64 V5.4H, V7.4H
    SSHR V19.4S, V19.4S, #16
    ST1 {V24.S}[0], [X2], X11
    SMLAL V19.4S, V9.4H, V10.4H
    ST1 {V24.S}[1], [X2], X11
    SSHR V23.4S, V23.4S, #16
    ST1 {V24.S}[2], [X2], X11
    SMLAL V23.4S, V1.4H, V2.4H
    ST1 {V24.S}[3], [X2], X11
    SMLSL V19.4S, V13.4H, V11.4H
    SMLSL V23.4S, V5.4H, V3.4H
    // load the quad consumed by the next iteration (or by the epilogue)
    LD2 {V8.4H, V9.4H}, [X10], #16
    LD2 {V10.4H, V11.4H}, [X3], #16
    LD2 {V14.4H, V15.4H}, [X7], X12
    SQADD V18.4S, V19.4S, V20.4S
    REV64 V12.4H, V14.4H
    REV64 V13.4H, V15.4H
    SQADD V22.4S, V23.4S, V20.4S
    SSHL V18.4S, V18.4S, V21.4S
    MOV V16.16B, V18.16B
    ST1 {V16.S}[0], [X2], X11
    SSHL V22.4S, V22.4S, V21.4S
    MOV V24.16B, V22.16B
    SUBS X8, X8, #8
    ST1 {V16.S}[1], [X2], X11
    ST1 {V16.S}[2], [X2], X11
    ST1 {V16.S}[3], [X2], X11
    BGT LOOP_1

    // ---- pass 1 epilogue: store pending V24, finish the last loaded quad ----
    ST1 {V24.S}[0], [X2], X11
    UMULL V19.4S, V8.4H, V10.4H
    UMLSL V19.4S, V12.4H, V11.4H
    ST1 {V24.S}[1], [X2], X11
    ST1 {V24.S}[2], [X2], X11
    SSHR V19.4S, V19.4S, #16
    ST1 {V24.S}[3], [X2], X11
    SMLAL V19.4S, V9.4H, V10.4H
    SMLSL V19.4S, V13.4H, V11.4H

    // ---- pass 2 set-up (interleaved with the last pass-1 stores) ------------
    MOV X12, #12
    // Lane-0 16x16 signed multiply of the low halfwords of W5 and W6; only
    // V29.S[0] is read afterwards, the other lanes are don't-care.
    MOV V30.S[0], W5
    MOV V31.S[0], W6
    SMULL V29.4S, V30.4H, V31.4H
    MOV W7, V29.S[0]                        // W7 = lo16(N) * lo16(stride)
    LSL W10, W5, #1                         // W10 = 2*N
    SQADD V18.4S, V19.4S, V20.4S
    SSHL V18.4S, V18.4S, V21.4S
    MOV V16.16B, V18.16B
    ST1 {V16.S}[0], [X2], X11
    LSL X7, X7, #2
    ST1 {V16.S}[1], [X2], X11
    ADD X7, X7, X9                          // X7 = pass-2 output pointer (saved out base + 4*product)
    ST1 {V16.S}[2], [X2], X11
    ST1 {V16.S}[3], [X2], X11
    SUB X11, X10, #1                        // X11 = 2*N - 1
    LSL X10, X11, #2
    ADD X10, X0, X10
    LSL X11, X11, #1
    SUB X10, X10, X12                       // X10 -> in[2N-4], read descending
    LSL X8, X6, #2                          // X8 = output stride in bytes
    MOV X12, #-16                           // post-index step for descending loads
    ADD X11, X11, X4                        // X4 holds the saved window base
    LD1 {V6.4S}, [X10], X12
    SUB X11, X11, #14                       // X11 -> win[2N-8], read descending

    // Negate and reverse 4 input words, then split into halves:
    // low 4 lanes of V1 = low halves, of V0 = high halves, in descending
    // word order (REV64.4S + UZP + REV64.4S together reverse the quad).
    REV64 V0.4S, V6.4S
    SQNEG V0.4S, V0.4S
    UZP1 V1.8H, V0.8H, V0.8H
    UZP2 V0.8H, V0.8H, V0.8H
    REV64 V1.4S, V1.4S
    REV64 V0.4S, V0.4S
    LD2 {V2.4H, V3.4H}, [X11], X12
    REV64 V2.4H, V2.4H                      // V2 = even window entries, descending
    REV64 V3.4H, V3.4H                      // V3 = odd window entries, descending
    LD2 {V4.4H, V5.4H}, [X1], #16           // prev read ascending from prev[0]

    // ---- pass 2 prologue: first quad -> V24, second quad loaded -------------
    LD1 {V14.4S}, [X10], X12
    UMULL V23.4S, V1.4H, V3.4H
    UMLSL V23.4S, V4.4H, V2.4H
    REV64 V8.4S, V14.4S
    SQNEG V8.4S, V8.4S
    LD2 {V10.4H, V11.4H}, [X11], X12
    SSHR V23.4S, V23.4S, #16
    LD2 {V12.4H, V13.4H}, [X1], #16
    SMLAL V23.4S, V0.4H, V3.4H
    SMLSL V23.4S, V5.4H, V2.4H
    UZP1 V9.8H, V8.8H, V8.8H
    UZP2 V8.8H, V8.8H, V8.8H
    rev64 v9.4s, v9.4s
    rev64 v8.4s, v8.4s
    REV64 V10.4H, V10.4H
    REV64 V11.4H, V11.4H
    SQADD V22.4S, V23.4S, V20.4S
    SUB X5, X5, #8                          // X5 = pass-2 loop counter
    SSHL V22.4S, V22.4S, V21.4S
    MOV V24.16B, V22.16B                    // V24 = finished quad, stored inside the loop

    // ---- pass 2 main loop: 8 results per iteration --------------------------
    // Same pipelining as LOOP_1: store V24, compute the quad held in V8-V13
    // into V19 and the freshly loaded quad into V23.
LOOP_2:
    LD1 {V6.4S}, [X10], X12
    UMULL V19.4S, V9.4H, V11.4H
    REV64 V0.4S, V6.4S
    SQNEG V0.4S, V0.4S
    UZP1 V1.8H, V0.8H, V0.8H
    UZP2 V0.8H, V0.8H, V0.8H
    REV64 V1.4S, V1.4S
    REV64 V0.4S, V0.4S
    LD2 {V2.4H, V3.4H}, [X11], X12
    // .8H form here (the prologue uses .4H); only the low four lanes are
    // consumed below, as .4H operands.
    REV64 V2.8H, V2.8H
    REV64 V3.8H, V3.8H
    LD2 {V4.4H, V5.4H}, [X1], #16
    UMLSL V19.4S, V12.4H, V10.4H
    ST1 {V24.S}[0], [X7], X8
    UMULL V23.4S, V1.4H, V3.4H
    ST1 {V24.S}[1], [X7], X8
    SSHR V19.4S, V19.4S, #16
    ST1 {V24.S}[2], [X7], X8
    UMLSL V23.4S, V4.4H, V2.4H
    ST1 {V24.S}[3], [X7], X8
    SMLAL V19.4S, V8.4H, V11.4H
    LD1 {V14.4S}, [X10], X12
    SSHR V23.4S, V23.4S, #16
    SMLSL V19.4S, V13.4H, V10.4H
    LD2 {V10.4H, V11.4H}, [X11], X12
    SMLAL V23.4S, V0.4H, V3.4H
    SMLSL V23.4S, V5.4H, V2.4H
    REV64 V8.4S, V14.4S
    LD2 {V12.4H, V13.4H}, [X1], #16
    SQNEG V8.4S, V8.4S
    REV64 V11.4H, V11.4h
    REV64 V10.4H, V10.4H
    SQADD V18.4S, V19.4S, V20.4S
    UZP1 V9.8H, V8.8H, V8.8H
    UZP2 V8.8H, V8.8H, V8.8H
    rev64 v9.4s, v9.4s
    rev64 v8.4s, v8.4s
    SQADD V22.4S, V23.4S, V20.4S
    SSHL V18.4S, V18.4S, V21.4S
    SUBS X5, X5, #8
    MOV V16.16B, V18.16B
    ST1 {V16.S}[0], [X7], X8
    SSHL V22.4S, V22.4S, V21.4S
    ST1 {V16.S}[1], [X7], X8
    MOV V24.16B, V22.16B
    ST1 {V16.S}[2], [X7], X8
    ST1 {V16.S}[3], [X7], X8
    BGT LOOP_2

    // ---- pass 2 epilogue: store pending V24, finish the last loaded quad ----
    ST1 {V24.S}[0], [X7], X8
    UMULL V19.4S, V9.4H, V11.4H
    UMLSL V19.4S, V12.4H, V10.4H
    ST1 {V24.S}[1], [X7], X8
    ST1 {V24.S}[2], [X7], X8
    SSHR V19.4S, V19.4S, #16
    ST1 {V24.S}[3], [X7], X8
    SMLAL V19.4S, V8.4H, V11.4H
    SMLSL V19.4S, V13.4H, V10.4H
    SQADD V18.4S, V19.4S, V20.4S
    SSHL V18.4S, V18.4S, V21.4S
    MOV V16.16B, V18.16B
    ST1 {V16.S}[0], [X7], X8
    ST1 {V16.S}[1], [X7], X8
    ST1 {V16.S}[2], [X7], X8
    ST1 {V16.S}[3], [X7], X8

    pop_v_regs
    RET