| ///****************************************************************************** |
| // * |
| // * Copyright (C) 2018 The Android Open Source Project |
| // * |
| // * Licensed under the Apache License, Version 2.0 (the "License"); |
| // * you may not use this file except in compliance with the License. |
| // * You may obtain a copy of the License at: |
| // * |
| // * http://www.apache.org/licenses/LICENSE-2.0 |
| // * |
| // * Unless required by applicable law or agreed to in writing, software |
| // * distributed under the License is distributed on an "AS IS" BASIS, |
| // * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // * See the License for the specific language governing permissions and |
| // * limitations under the License. |
| // * |
| // ***************************************************************************** |
| // * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| //*/ |
| |
| .macro push_v_regs |
| stp q8, q9, [sp, #-32]! |
| stp q10, q11, [sp, #-32]! |
| stp q12, q13, [sp, #-32]! |
| stp q14, q15, [sp, #-32]! |
| stp X8, X9, [sp, #-16]! |
| stp X10, X11, [sp, #-16]! |
| stp X12, X13, [sp, #-16]! |
| stp X14, X15, [sp, #-16]! |
| stp X16, X17, [sp, #-16]! |
| stp X29, X30, [sp, #-16]! |
| .endm |
| .macro pop_v_regs |
| ldp X29, X30, [sp], #16 |
| ldp X16, X17, [sp], #16 |
| ldp X14, X15, [sp], #16 |
| ldp X12, X13, [sp], #16 |
| ldp X10, X11, [sp], #16 |
| ldp X8, X9, [sp], #16 |
| ldp q14, q15, [sp], #32 |
| ldp q12, q13, [sp], #32 |
| ldp q10, q11, [sp], #32 |
| ldp q8, q9, [sp], #32 |
| .endm |
| |
| .text |
| .global ixheaacd_over_lap_add1_armv8 |
| ixheaacd_over_lap_add1_armv8: |
| push_v_regs |
| LSL X10, X5, #1 |
| SUB X11, X10, #1 |
| LSL X10, X11, #2 |
| ADD X10, X0, X10 |
| SUB X10, X10, #12 |
| LSL X8, X11, #1 |
| ADD X8, X8, X3 |
| SUB X8, X8, #14 |
| MOV X12, #-16 |
| DUP V11.8H, W4 |
| LD1 {V3.4S}, [X10], X12 |
| MOV W7, #0x2000 |
| |
| NEG W7, W7 |
| SQNEG V0.4S, V3.4S |
| DUP V10.4S, W7 |
| UZP1 V31.8H, V0.8H, V0.8H |
| UZP2 V30.8H, V0.8H, V0.8H |
| REV64 V31.8h, V31.8h |
| REV64 V30.8h, V30.8h |
| SUB X11, X5, #1 |
| UZP1 V7.8H, V3.8H, V3.8H |
| UZP2 V6.8H, V3.8H, V3.8H |
| REV64 V7.8H, V7.8H |
| REV64 V6.8H, V6.8H |
| MOV V16.S[0], W6 |
| MOV V17.S[0], W11 |
| SMULL V17.4S, V16.4H, V17.4H |
| MOV W11, V17.S[0] |
| LSL X11, X11, #1 |
| |
| LD2 {V2.4H, V3.4H}, [X8], X12 |
| ADD X11, X11, X2 |
| REV64 V2.4H, V2.4H |
| REV64 V3.4H, V3.4H |
| LSL X4, X6, #1 |
| NEG X4, X4 |
| LSL X9, X6, #1 |
| MOV V16.S[0], W5 |
| MOV V17.S[0], W6 |
| SMULL V17.4S, V16.4H, V17.4H |
| MOV W6, V17.S[0] |
| LSL W6, W6, #1 |
| ADD X6, X6, X2 |
| |
| UMULL V15.4S, V7.4H, V2.4H |
| LD1 {V4.4S}, [X1], #16 |
| USHR V15.4S, V15.4S, #16 |
| |
| SMLAL V15.4S, V6.4H, V2.4H |
| SQSHL V15.4S, V15.4S, V11.4S |
| SSHLL V27.4S, V3.4H, #0 |
| SMULL V28.2D, V27.2S, V4.2S |
| SMULL2 V29.2D, V27.4S, V4.4S |
| SQXTN V28.2S, V28.2D |
| SQXTN2 V28.4S, V29.2D |
| MOV V14.16B, V28.16B |
| |
| SQADD V14.4S, V14.4S, V10.4S |
| SQSUB V13.4S, V15.4S, V14.4S |
| SQSHL V13.4S, V13.4S, #2 |
| SSHR V13.4S, V13.4S, #16 |
| UZP1 V26.8H, V13.8H, V13.8H |
| |
| UMULL V12.4S, V31.4H, V3.4H |
| USHR V12.4S, V12.4S, #16 |
| SMLAL V12.4S, V30.4H, V3.4H |
| SQSHL V12.4S, V12.4S, V11.4S |
| LD1 {V3.4S}, [X10], X12 |
| |
| SSHLL V27.4S, V2.4H, #0 |
| SMULL V28.2D, V27.2S, V4.2S |
| SMULL2 V29.2D, V27.4S, V4.4S |
| SQXTN V28.2S, V28.2D |
| SQXTN2 V28.4S, V29.2D |
| MOV V8.16B, V28.16B |
| |
| SQADD V8.4S, V8.4S, V10.4S |
| |
| SQNEG V0.4S, V3.4S |
| UZP1 V1.8H, V0.8H, V0.8H |
| UZP2 V0.8H, V0.8H, V0.8H |
| REV64 V1.8h, V1.8h |
| REV64 V0.8h, V0.8h |
| SQSUB V9.4S, V12.4S, V8.4S |
| UZP1 V7.8H, V3.8H, V3.8H |
| UZP2 V6.8H, V3.8H, V3.8H |
| REV64 V7.8h, V7.8h |
| REV64 V6.8h, V6.8h |
| SQSHL V9.4S, V9.4S, #2 |
| LD2 {V2.4H, V3.4H}, [X8], X12 |
| SSHR V9.4S, V9.4S, #16 |
| REV64 V2.4H, V2.4H |
| REV64 V3.4H, V3.4H |
| UZP1 V18.8H, V9.8H, V9.8H |
| |
| LD1 {V4.4S}, [X1], #16 |
| SUB W5, W5, #8 |
| |
| |
| LOOP_1: |
| |
| ST1 {V26.H}[0], [X11], X4 |
| UMULL V15.4S, V7.4H, V2.4H |
| ST1 {V26.H}[1], [X11], X4 |
| UMULL V12.4S, V1.4H, V3.4H |
| ST1 {V26.H}[2], [X11], X4 |
| USHR V15.4S, V15.4S, #16 |
| ST1 {V26.H}[3], [X11], X4 |
| USHR V12.4S, V12.4S, #16 |
| ST1 {V18.H}[0], [X6], X9 |
| SMLAL V15.4S, V6.4H, V2.4H |
| ST1 {V18.H}[1], [X6], X9 |
| SMLAL V12.4S, V0.4H, V3.4H |
| ST1 {V18.H}[2], [X6], X9 |
| SQSHL V15.4S, V15.4S, V11.4S |
| ST1 {V18.H}[3], [X6], X9 |
| SQSHL V12.4S, V12.4S, V11.4S |
| LD1 {V6.4S}, [X10], X12 |
| |
| SSHLL V27.4S, V3.4H, #0 |
| SMULL V28.2D, V27.2S, V4.2S |
| SMULL2 V29.2D, V27.4S, V4.4S |
| SQXTN V28.2S, V28.2D |
| SQXTN2 V28.4S, V29.2D |
| MOV V14.16B, V28.16B |
| |
| SSHLL V27.4S, V2.4H, #0 |
| SMULL V28.2D, V27.2S, V4.2S |
| SMULL2 V29.2D, V27.4S, V4.4S |
| SQXTN V28.2S, V28.2D |
| SQXTN2 V28.4S, V29.2D |
| MOV V8.16B, V28.16B |
| |
| LD2 {V2.4H, V3.4H}, [X8], X12 |
| |
| SQNEG V0.4S, V6.4S |
| |
| LD1 {V4.4S}, [X1], #16 |
| |
| SQADD V14.4S, V14.4S, V10.4S |
| UZP1 V1.8H, V0.8H, V0.8H |
| UZP2 V0.8H, V0.8H, V0.8H |
| REV64 V1.8h, V1.8h |
| REV64 V0.8h, V0.8h |
| SQADD V8.4S, V8.4S, V10.4S |
| UZP1 V7.8H, V6.8H, V6.8H |
| UZP2 V6.8H, V6.8H, V6.8H |
| REV64 V7.8h, V7.8h |
| REV64 V6.8h, V6.8h |
| SQSUB V13.4S, V15.4S, V14.4S |
| REV64 V2.4H, V2.4H |
| REV64 V3.4H, V3.4H |
| SQSUB V9.4S, V12.4S, V8.4S |
| SQSHL V13.4S, V13.4S, #2 |
| SQSHL V9.4S, V9.4S, #2 |
| UMULL V15.4S, V7.4H, V2.4H |
| SSHR V13.4S, V13.4S, #16 |
| UZP1 V26.8H, V13.8H, V13.8H |
| SSHR V9.4S, V9.4S, #16 |
| ST1 {V26.H}[0], [X11], X4 |
| UMULL V12.4S, V1.4H, V3.4H |
| UZP1 V18.8H, V9.8H, V9.8H |
| USHR V15.4S, V15.4S, #16 |
| ST1 {V26.H}[1], [X11], X4 |
| SMLAL V15.4S, V6.4H, V2.4H |
| ST1 {V26.H}[2], [X11], X4 |
| USHR V12.4S, V12.4S, #16 |
| ST1 {V26.H}[3], [X11], X4 |
| SMLAL V12.4S, V0.4H, V3.4H |
| ST1 {V18.H}[0], [X6], X9 |
| SQSHL V15.4S, V15.4S, V11.4S |
| ST1 {V18.H}[1], [X6], X9 |
| SQSHL V12.4S, V12.4S, V11.4S |
| ST1 {V18.H}[2], [X6], X9 |
| |
| SSHLL V27.4S, V3.4H, #0 |
| SMULL V28.2D, V27.2S, V4.2S |
| SMULL2 V29.2D, V27.4S, V4.4S |
| SQXTN V28.2S, V28.2D |
| SQXTN2 V28.4S, V29.2D |
| MOV V14.16B, V28.16B |
| |
| ST1 {V18.H}[3], [X6], X9 |
| |
| |
| SSHLL V27.4S, V2.4H, #0 |
| SMULL V28.2D, V27.2S, V4.2S |
| SMULL2 V29.2D, V27.4S, V4.4S |
| SQXTN V28.2S, V28.2D |
| SQXTN2 V28.4S, V29.2D |
| MOV V8.16B, V28.16B |
| |
| LD1 {V3.4S}, [X10], X12 |
| SQADD V14.4S, V14.4S, V10.4S |
| |
| SQNEG V0.4S, V3.4S |
| UZP1 V1.8H, V0.8H, V0.8H |
| UZP2 V0.8H, V0.8H, V0.8H |
| REV64 V1.8H, V1.8H |
| REV64 V0.8H, V0.8H |
| SQSUB V13.4S, V15.4S, V14.4S |
| UZP1 V7.8H, V3.8H, V3.8H |
| UZP2 V6.8H, V3.8H, V3.8H |
| REV64 V7.8H, V7.8H |
| REV64 V6.8H, V6.8H |
| SQADD V8.4S, V8.4S, V10.4S |
| LD2 {V2.4H, V3.4H}, [X8], X12 |
| SQSUB V9.4S, V12.4S, V8.4S |
| REV64 V2.4H, V2.4H |
| REV64 V3.4H, V3.4H |
| SQSHL V13.4S, V13.4S, #2 |
| LD1 {V4.4S}, [X1], #16 |
| |
| SQSHL V9.4S, V9.4S, #2 |
| SSHR V13.4S, V13.4S, #16 |
| SUBS X5, X5, #8 |
| SSHR V9.4S, V9.4S, #16 |
| UZP1 V26.8H, V13.8H, V13.8H |
| UZP1 V18.8H, V9.8H, V9.8H |
| |
| BGT LOOP_1 |
| |
| ST1 {V26.H}[0], [X11], X4 |
| UMULL V15.4S, V7.4H, V2.4H |
| ST1 {V26.H}[1], [X11], X4 |
| UMULL V12.4s, V1.4H, V3.4H |
| ST1 {V26.H}[2], [X11], X4 |
| USHR V15.4S, V15.4S, #16 |
| ST1 {V26.H}[3], [X11], X4 |
| USHR V12.4S, V12.4S, #16 |
| |
| ST1 {V18.H}[0], [X6], X9 |
| SMLAL V15.4S, V6.4H, V2.4H |
| ST1 {V18.H}[1], [X6], X9 |
| SMLAL V12.4S, V0.4H, V3.4H |
| ST1 {V18.H}[2], [X6], X9 |
| SQSHL V15.4S, V15.4S, V11.4S |
| ST1 {V18.H}[3], [X6], X9 |
| SQSHL V12.4S, V12.4S, V11.4S |
| |
| |
| SSHLL V27.4S, V3.4H, #0 |
| SMULL V28.2D, V27.2S, V4.2S |
| SMULL2 V29.2D, V27.4S, V4.4S |
| SQXTN V28.2S, V28.2D |
| SQXTN2 V28.4S, V29.2D |
| MOV V14.16B, V28.16B |
| |
| SSHLL V27.4S, V2.4H, #0 |
| SMULL V28.2D, V27.2S, V4.2S |
| SMULL2 V29.2D, V27.4S, V4.4S |
| SQXTN V28.2S, V28.2D |
| SQXTN2 V28.4S, V29.2D |
| MOV V8.16B, V28.16B |
| |
| SQADD V14.4S, V14.4S, V10.4S |
| SQADD V8.4S, V8.4S, V10.4S |
| SQSUB V13.4S, V15.4S, V14.4S |
| SQSUB V9.4S, V12.4S, V8.4S |
| SQSHL V13.4S, V13.4S, #2 |
| SQSHL V9.4S, V9.4S, #2 |
| SSHR V13.4S, V13.4S, #16 |
| SSHR V9.4S, V9.4S, #16 |
| UZP1 V26.8H, V13.8H, V13.8H |
| |
| UZP1 V18.8H, V9.8H, V9.8H |
| |
| |
| ST1 {V26.H}[0], [X11], X4 |
| ST1 {V26.H}[1], [X11], X4 |
| ST1 {V26.H}[2], [X11], X4 |
| ST1 {V26.H}[3], [X11], X4 |
| |
| ST1 {V18.H}[0], [X6], X9 |
| ST1 {V18.H}[1], [X6], X9 |
| ST1 {V18.H}[2], [X6], X9 |
| ST1 {V18.H}[3], [X6], X9 |
| pop_v_regs |
| RET |
| |
| |
| |
| |