| ///***************************************************************************** |
| //* |
| //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //*****************************************************************************/ |
| ///** |
| // ******************************************************************************* |
| // * @file |
| // * ihevc_itrans_recon_8x8_neon.s |
| // * |
| // * @brief |
| // * contains function definitions for single stage inverse transform |
| // * |
| // * @author |
| // * anand s |
| // * |
| // * @par list of functions: |
| // * - ihevc_itrans_recon_32x32() |
| // * |
| // * @remarks |
| // * the input buffer is being corrupted |
| // * |
| // ******************************************************************************* |
| //*/ |
| |
| ///** |
| // ******************************************************************************* |
| // * |
| // * @brief |
| // * this function performs inverse transform and reconstruction for 8x8 |
| // * input block |
| // * |
| // * @par description: |
| // * performs inverse transform and adds the prediction data and clips output |
| // * to 8 bit |
| // * |
| // * @param[in] pi2_src |
| // * input 16x16 coefficients |
| // * |
| // * @param[in] pi2_tmp |
| // * temporary 16x16 buffer for storing inverse |
| // * |
| // * transform |
| // * 1st stage output |
| // * |
| // * @param[in] pu1_pred |
| // * prediction 16x16 block |
| // * |
| // * @param[out] pu1_dst |
| // * output 8x8 block |
| // * |
| // * @param[in] src_strd |
| // * input stride |
| // * |
| // * @param[in] pred_strd |
| // * prediction stride |
| // * |
| // * @param[in] dst_strd |
| // * output stride |
| // * |
| // * @param[in] shift |
| // * output shift |
| // * |
| // * @param[in] x12 |
| // * zero columns in pi2_src |
| // * |
| // * @returns void |
| // * |
| // * @remarks |
| // * none |
| // * |
| // ******************************************************************************* |
| // */ |
| |
| //void ihevc_itrans_recon_32x32(word16 *pi2_src, |
| // word16 *pi2_tmp, |
| // uword8 *pu1_pred, |
| // uword8 *pu1_dst, |
| // word32 src_strd, |
| // word32 pred_strd, |
| // word32 dst_strd, |
| // word32 x12 |
| // word32 x11 ) |
| |
| //**************variables vs registers************************* |
| // x0 => *pi2_src |
| // x1 => *pi2_tmp |
| // x2 => *pu1_pred |
| // x3 => *pu1_dst |
| // src_strd |
| // pred_strd |
| // dst_strd |
| // x12 |
| // x11 |
| |
| |
| //d0[0]= 64 d2[0]=83 |
| //d0[1]= 90 d2[1]=82 |
| //d0[2]= 90 d2[2]=80 |
| //d0[3]= 90 d2[3]=78 |
| //d1[0]= 89 d3[0]=75 |
| //d1[1]= 88 d3[1]=73 |
| //d1[2]= 87 d3[2]=70 |
| //d1[3]= 85 d3[3]=67 |
| |
| //d4[0]= 64 d6[0]=36 |
| //d4[1]= 61 d6[1]=31 |
| //d4[2]= 57 d6[2]=25 |
| //d4[3]= 54 d6[3]=22 |
| //d5[0]= 50 d7[0]=18 |
| //d5[1]= 46 d7[1]=13 |
| //d5[2]= 43 d7[2]=9 |
| //d5[3]= 38 d7[3]=4 |
| |
| .text |
| .align 4 |
| .include "ihevc_neon_macros.s" |
| |
| |
| |
| |
| .set shift_stage1_idct , 7 |
| .set shift_stage2_idct , 12 |
| |
| //#define zero_cols x12 |
| //#define zero_rows x11 |
| |
| .globl ihevc_itrans_recon_32x32_av8 |
| |
| .extern g_ai2_ihevc_trans_32_transpose |
| |
| x5_addr: .word 0xfffff000 |
| x9_addr: .word 0xffff0000 |
| |
| .type ihevc_itrans_recon_32x32_av8, %function |
| |
| ihevc_itrans_recon_32x32_av8: |
| |
| ldr w11, [sp] |
| |
| // stmfd sp!,{x0-x12,x14} |
| push_v_regs |
| stp x19, x20,[sp,#-16]! |
| stp x0, x1,[sp,#-16]! |
| stp x5, x6,[sp,#-16]! |
| |
| //ldr x8,[sp,#56] @ prediction stride |
| //ldr x7,[sp,#64] @ destination stride |
| mov x6, x4 // src stride |
| mov x12, x7 |
| lsl x6, x6, #1 // x sizeof(word16) |
| add x10,x6,x6, lsl #1 // 3 rows |
| |
| |
| mov x8,x0 |
| |
| adrp x14, :got:g_ai2_ihevc_trans_32_transpose |
| ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_32_transpose] |
| |
| ld1 {v0.4h, v1.4h, v2.4h, v3.4h},[x14],#32 |
| ld1 {v4.4h, v5.4h, v6.4h, v7.4h},[x14],#32 |
| |
| //registers which are free |
| // x10,x9,x11,x12 |
| mov x9,#0xffffff00 |
| mov x10,#0xfffffff0 |
| ldr w5, x5_addr |
| ldr w7, x9_addr |
| cmp x12,x10 |
| mov x20,#1 |
| csel x14, x20, x14,hs |
| bhs stage1 |
| |
| |
| cmp x12,x9 |
| mov x20,#2 |
| csel x14, x20, x14,hs |
| bhs stage1 |
| |
| cmp x12,x5 |
| mov x20,#3 |
| csel x14, x20, x14,hs |
| bhs stage1 |
| |
| cmp x12,x7 |
| mov x20,#4 |
| csel x14, x20, x14,hs |
| |
| mov x14,#8 |
| b stage1 |
| //.ltorg |
| |
| |
| dct_stage1: |
| add x8,x8,#8 |
| mov x0,x8 |
| |
| stage1: |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0) |
| smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlal v26.4s, v9.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smull v20.4s, v10.4h, v0.h[0] |
| smlal v20.4s, v11.4h, v0.h[2] |
| |
| |
| smull v22.4s, v10.4h, v0.h[0] |
| smlal v22.4s, v11.4h, v1.h[2] |
| |
| smull v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v2.h[2] |
| |
| smull v18.4s, v10.4h, v0.h[0] |
| smlal v18.4s, v11.4h, v3.h[2] |
| cmp x11,x10 |
| bhs shift1 |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v1.h[1] |
| smlal v26.4s, v14.4h, v3.h[3] |
| smlal v28.4s, v14.4h, v6.h[1] |
| smlsl v30.4s, v14.4h, v7.h[1] |
| |
| |
| smlal v24.4s, v15.4h, v1.h[3] |
| smlal v26.4s, v15.4h, v5.h[1] |
| smlsl v28.4s, v15.4h, v7.h[1] |
| smlsl v30.4s, v15.4h, v3.h[3] |
| |
| |
| smlal v20.4s, v12.4h, v1.h[0] |
| smlal v20.4s, v13.4h, v1.h[2] |
| smlal v22.4s, v12.4h, v3.h[0] |
| smlal v22.4s, v13.4h, v4.h[2] |
| smlal v16.4s, v12.4h, v5.h[0] |
| smlal v16.4s, v13.4h, v7.h[2] |
| smlal v18.4s, v12.4h, v7.h[0] |
| smlsl v18.4s, v13.4h, v5.h[2] |
| |
| cmp x11,x9 |
| bhs shift1 |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| |
| smlal v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0) |
| smlal v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) |
| smlsl v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2) |
| smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v2.h[0] |
| smlal v20.4s, v11.4h, v2.h[2] |
| |
| |
| smlal v22.4s, v10.4h, v6.h[0] |
| smlal v22.4s, v11.4h, v7.h[2] |
| |
| smlsl v16.4s, v10.4h, v6.h[0] |
| smlsl v16.4s, v11.4h, v3.h[2] |
| |
| smlsl v18.4s, v10.4h, v2.h[0] |
| smlsl v18.4s, v11.4h, v1.h[2] |
| |
| cmp x11,x5 |
| bhs shift1 |
| |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v3.h[1] |
| smlsl v26.4s, v14.4h, v6.h[1] |
| smlsl v28.4s, v14.4h, v0.h[1] |
| smlsl v30.4s, v14.4h, v6.h[3] |
| |
| |
| smlal v24.4s, v15.4h, v3.h[3] |
| smlsl v26.4s, v15.4h, v4.h[3] |
| smlsl v28.4s, v15.4h, v2.h[3] |
| smlal v30.4s, v15.4h, v5.h[3] |
| |
| |
| smlal v20.4s, v12.4h, v3.h[0] |
| smlal v20.4s, v13.4h, v3.h[2] |
| smlsl v22.4s, v12.4h, v7.h[0] |
| smlsl v22.4s, v13.4h, v5.h[2] |
| smlsl v16.4s, v12.4h, v1.h[0] |
| smlsl v16.4s, v13.4h, v1.h[2] |
| smlsl v18.4s, v12.4h, v5.h[0] |
| smlal v18.4s, v13.4h, v7.h[2] |
| |
| cmp x11,x7 |
| bhs shift1 |
| |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| |
| |
| smlal v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1) |
| smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlal v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v0.h[0] |
| smlal v20.4s, v11.4h, v4.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v0.h[0] |
| smlsl v22.4s, v11.4h, v2.h[2] |
| |
| smlsl v16.4s, v10.4h, v0.h[0] |
| smlsl v16.4s, v11.4h, v6.h[2] |
| |
| smlal v18.4s, v10.4h, v0.h[0] |
| smlal v18.4s, v11.4h, v0.h[2] |
| |
| |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v5.h[1] |
| smlsl v26.4s, v14.4h, v0.h[2] |
| smlal v28.4s, v14.4h, v5.h[3] |
| smlal v30.4s, v14.4h, v4.h[3] |
| |
| |
| smlal v24.4s, v15.4h, v5.h[3] |
| smlsl v26.4s, v15.4h, v1.h[1] |
| smlal v28.4s, v15.4h, v3.h[1] |
| smlsl v30.4s, v15.4h, v7.h[3] |
| |
| |
| smlal v20.4s, v12.4h, v5.h[0] |
| smlal v20.4s, v13.4h, v5.h[2] |
| smlsl v22.4s, v12.4h, v1.h[0] |
| smlsl v22.4s, v13.4h, v0.h[2] |
| smlal v16.4s, v12.4h, v7.h[0] |
| smlal v16.4s, v13.4h, v4.h[2] |
| smlal v18.4s, v12.4h, v3.h[0] |
| smlal v18.4s, v13.4h, v6.h[2] |
| |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| |
| |
| |
| |
| |
| |
| smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) |
| smlal v28.4s, v8.4h, v0.h[1] //// y1 * sin3(part of b2) |
| smlsl v30.4s, v8.4h, v4.h[1] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v9.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v6.h[0] |
| smlal v20.4s, v11.4h, v6.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v2.h[0] |
| smlsl v22.4s, v11.4h, v3.h[2] |
| |
| smlal v16.4s, v10.4h, v2.h[0] |
| smlal v16.4s, v11.4h, v0.h[2] |
| |
| smlsl v18.4s, v10.4h, v6.h[0] |
| smlsl v18.4s, v11.4h, v2.h[2] |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| smlal v24.4s, v14.4h, v7.h[1] |
| smlsl v26.4s, v14.4h, v5.h[3] |
| smlal v28.4s, v14.4h, v4.h[1] |
| smlsl v30.4s, v14.4h, v2.h[3] |
| |
| |
| smlal v24.4s, v15.4h, v7.h[3] |
| smlsl v26.4s, v15.4h, v7.h[1] |
| smlal v28.4s, v15.4h, v6.h[3] |
| smlsl v30.4s, v15.4h, v6.h[1] |
| |
| |
| smlal v20.4s, v12.4h, v7.h[0] |
| smlal v20.4s, v13.4h, v7.h[2] |
| smlsl v22.4s, v12.4h, v5.h[0] |
| smlsl v22.4s, v13.4h, v6.h[2] |
| smlal v16.4s, v12.4h, v3.h[0] |
| smlal v16.4s, v13.4h, v5.h[2] |
| smlsl v18.4s, v12.4h, v1.h[0] |
| smlsl v18.4s, v13.4h, v4.h[2] |
| |
| |
| |
| shift1: |
| add v8.4s, v20.4s , v24.4s |
| sub v10.4s, v20.4s , v24.4s |
| |
| add v12.4s, v22.4s , v26.4s |
| sub v24.4s, v22.4s , v26.4s |
| |
| add v14.4s, v16.4s , v28.4s |
| sub v26.4s, v16.4s , v28.4s |
| |
| |
| add v16.4s, v18.4s , v30.4s |
| sub v28.4s, v18.4s , v30.4s |
| |
| |
| sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) |
| |
| |
| // registers used q15,q14,q6,q7 |
| |
| umov x15,v24.d[0] |
| umov x16,v25.d[0] |
| umov x19,v26.d[0] |
| umov x20,v27.d[0] |
| |
| trn1 v24.4h, v30.4h, v12.4h |
| trn2 v25.4h, v30.4h, v12.4h |
| trn1 v26.4h, v31.4h, v13.4h |
| trn2 v27.4h, v31.4h, v13.4h |
| |
| trn1 v30.2s, v24.2s, v26.2s |
| trn2 v31.2s, v24.2s, v26.2s |
| trn1 v12.2s, v25.2s, v27.2s |
| trn2 v13.2s, v25.2s, v27.2s |
| |
| trn1 v24.4h, v14.4h, v18.4h |
| trn2 v25.4h, v14.4h, v18.4h |
| trn1 v26.4h, v15.4h, v19.4h |
| trn2 v27.4h, v15.4h, v19.4h |
| |
| trn1 v14.2s, v24.2s, v26.2s |
| trn2 v15.2s, v24.2s, v26.2s |
| trn1 v18.2s, v25.2s, v27.2s |
| trn2 v19.2s, v25.2s, v27.2s |
| |
| mov v24.d[0],x15 |
| mov v25.d[0],x16 |
| mov v26.d[0],x19 |
| mov v27.d[0],x20 |
| |
| // d30 =x0 1- 4 values |
| // d31 =x2 1- 4 values |
| // d12=x1 1- 4 values |
| // d13=x3 1- 4 values |
| // d14 =x0 28-31 values |
| // d15 =x2 28- 31 values |
| // d18=x1 28- 31 values |
| // d19=x3 28- 31 values |
| |
| |
| |
| st1 { v30.4h, v31.4h},[x1],#16 |
| st1 { v12.4h, v13.4h},[x1],#16 |
| add x1,x1,#192 |
| st1 { v14.4h, v15.4h},[x1],#16 |
| st1 { v18.4h, v19.4h},[x1],#16 |
| sub x1,x1,#224 |
| |
| mov x0,x8 |
| |
| |
| |
| |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| |
| |
| |
| smull v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0) |
| smull v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smull v20.4s, v10.4h, v0.h[0] |
| smlal v20.4s, v11.4h, v4.h[2] |
| |
| |
| smull v22.4s, v10.4h, v0.h[0] |
| smlal v22.4s, v11.4h, v5.h[2] |
| |
| smull v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v6.h[2] |
| |
| smull v18.4s, v10.4h, v0.h[0] |
| smlal v18.4s, v11.4h, v7.h[2] |
| cmp x11,x10 |
| bhs shift2 |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| smlsl v24.4s, v14.4h, v4.h[3] |
| smlsl v26.4s, v14.4h, v2.h[1] |
| smlsl v28.4s, v14.4h, v0.h[1] |
| smlsl v30.4s, v14.4h, v2.h[3] |
| |
| |
| smlsl v24.4s, v15.4h, v0.h[3] |
| smlsl v26.4s, v15.4h, v3.h[1] |
| smlsl v28.4s, v15.4h, v6.h[3] |
| smlal v30.4s, v15.4h, v5.h[3] |
| |
| |
| smlsl v20.4s, v12.4h, v7.h[0] |
| smlsl v20.4s, v13.4h, v2.h[2] |
| smlsl v22.4s, v12.4h, v5.h[0] |
| smlsl v22.4s, v13.4h, v0.h[2] |
| smlsl v16.4s, v12.4h, v3.h[0] |
| smlsl v16.4s, v13.4h, v3.h[2] |
| smlsl v18.4s, v12.4h, v1.h[0] |
| smlsl v18.4s, v13.4h, v6.h[2] |
| |
| cmp x11,x9 |
| bhs shift2 |
| |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| |
| |
| |
| |
| |
| |
| smlsl v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) |
| smlal v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1) |
| smlal v28.4s, v8.4h, v2.h[3] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlal v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v6.h[3] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlsl v20.4s, v10.4h, v2.h[0] |
| smlsl v20.4s, v11.4h, v6.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v6.h[0] |
| smlal v22.4s, v11.4h, v4.h[2] |
| |
| smlal v16.4s, v10.4h, v6.h[0] |
| smlal v16.4s, v11.4h, v0.h[2] |
| |
| smlal v18.4s, v10.4h, v2.h[0] |
| smlal v18.4s, v11.4h, v5.h[2] |
| |
| cmp x11,x5 |
| bhs shift2 |
| |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v2.h[3] |
| smlal v26.4s, v14.4h, v3.h[3] |
| smlsl v28.4s, v14.4h, v5.h[3] |
| smlsl v30.4s, v14.4h, v0.h[3] |
| |
| |
| smlal v24.4s, v15.4h, v1.h[3] |
| smlsl v26.4s, v15.4h, v6.h[3] |
| smlsl v28.4s, v15.4h, v0.h[3] |
| smlal v30.4s, v15.4h, v7.h[3] |
| |
| |
| smlal v20.4s, v12.4h, v5.h[0] |
| smlal v20.4s, v13.4h, v0.h[2] |
| smlal v22.4s, v12.4h, v1.h[0] |
| smlal v22.4s, v13.4h, v6.h[2] |
| smlal v16.4s, v12.4h, v7.h[0] |
| smlsl v16.4s, v13.4h, v2.h[2] |
| smlsl v18.4s, v12.4h, v3.h[0] |
| smlsl v18.4s, v13.4h, v4.h[2] |
| |
| |
| cmp x11,x7 |
| bhs shift2 |
| |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| |
| |
| |
| |
| |
| |
| smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v1.h[1] //// y1 * cos3(part of b1) |
| smlsl v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v5.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlal v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v0.h[0] |
| smlsl v20.4s, v11.4h, v7.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v0.h[0] |
| smlsl v22.4s, v11.4h, v1.h[2] |
| |
| smlsl v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v5.h[2] |
| |
| smlal v18.4s, v10.4h, v0.h[0] |
| smlal v18.4s, v11.4h, v3.h[2] |
| |
| |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| smlsl v24.4s, v14.4h, v0.h[1] |
| smlal v26.4s, v14.4h, v6.h[1] |
| smlal v28.4s, v14.4h, v4.h[1] |
| smlsl v30.4s, v14.4h, v1.h[1] |
| |
| |
| smlsl v24.4s, v15.4h, v3.h[3] |
| smlal v26.4s, v15.4h, v0.h[1] |
| smlsl v28.4s, v15.4h, v5.h[1] |
| smlsl v30.4s, v15.4h, v6.h[1] |
| |
| |
| smlsl v20.4s, v12.4h, v3.h[0] |
| smlsl v20.4s, v13.4h, v1.h[2] |
| smlsl v22.4s, v12.4h, v7.h[0] |
| smlal v22.4s, v13.4h, v3.h[2] |
| smlal v16.4s, v12.4h, v1.h[0] |
| smlal v16.4s, v13.4h, v7.h[2] |
| smlsl v18.4s, v12.4h, v5.h[0] |
| smlsl v18.4s, v13.4h, v2.h[2] |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| |
| |
| |
| smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) |
| smlal v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1) |
| smlsl v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlsl v20.4s, v10.4h, v6.h[0] |
| smlal v20.4s, v11.4h, v5.h[2] |
| |
| |
| smlal v22.4s, v10.4h, v2.h[0] |
| smlal v22.4s, v11.4h, v7.h[2] |
| |
| smlsl v16.4s, v10.4h, v2.h[0] |
| smlsl v16.4s, v11.4h, v4.h[2] |
| |
| smlal v18.4s, v10.4h, v6.h[0] |
| smlal v18.4s, v11.4h, v1.h[2] |
| |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v1.h[1] |
| smlsl v26.4s, v14.4h, v0.h[3] |
| smlal v28.4s, v14.4h, v1.h[3] |
| smlsl v30.4s, v14.4h, v3.h[1] |
| |
| |
| smlal v24.4s, v15.4h, v5.h[3] |
| smlsl v26.4s, v15.4h, v5.h[1] |
| smlal v28.4s, v15.4h, v4.h[3] |
| smlsl v30.4s, v15.4h, v4.h[1] |
| |
| |
| smlal v20.4s, v12.4h, v1.h[0] |
| smlal v20.4s, v13.4h, v3.h[2] |
| smlsl v22.4s, v12.4h, v3.h[0] |
| smlsl v22.4s, v13.4h, v2.h[2] |
| smlal v16.4s, v12.4h, v5.h[0] |
| smlal v16.4s, v13.4h, v1.h[2] |
| smlsl v18.4s, v12.4h, v7.h[0] |
| smlsl v18.4s, v13.4h, v0.h[2] |
| |
| shift2: |
| add v8.4s, v20.4s , v24.4s |
| sub v10.4s, v20.4s , v24.4s |
| |
| add v12.4s, v22.4s , v26.4s |
| sub v24.4s, v22.4s , v26.4s |
| |
| add v14.4s, v16.4s , v28.4s |
| sub v26.4s, v16.4s , v28.4s |
| |
| |
| add v16.4s, v18.4s , v30.4s |
| sub v28.4s, v18.4s , v30.4s |
| |
| |
| sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) |
| |
| umov x15,v24.d[0] |
| umov x16,v25.d[0] |
| umov x19,v26.d[0] |
| umov x20,v27.d[0] |
| |
| trn1 v24.4h, v30.4h, v12.4h |
| trn2 v25.4h, v30.4h, v12.4h |
| trn1 v26.4h, v31.4h, v13.4h |
| trn2 v27.4h, v31.4h, v13.4h |
| |
| trn1 v30.2s, v24.2s, v26.2s |
| trn2 v31.2s, v24.2s, v26.2s |
| trn1 v12.2s, v25.2s, v27.2s |
| trn2 v13.2s, v25.2s, v27.2s |
| |
| trn1 v24.4h, v14.4h, v18.4h |
| trn2 v25.4h, v14.4h, v18.4h |
| trn1 v26.4h, v15.4h, v19.4h |
| trn2 v27.4h, v15.4h, v19.4h |
| |
| trn1 v14.2s, v24.2s, v26.2s |
| trn2 v15.2s, v24.2s, v26.2s |
| trn1 v18.2s, v25.2s, v27.2s |
| trn2 v19.2s, v25.2s, v27.2s |
| |
| mov v24.d[0],x15 |
| mov v25.d[0],x16 |
| mov v26.d[0],x19 |
| mov v27.d[0],x20 |
| |
| st1 { v30.4h, v31.4h},[x1],#16 |
| st1 { v12.4h, v13.4h},[x1],#16 |
| add x1,x1,#128 |
| st1 { v14.4h, v15.4h},[x1],#16 |
| st1 { v18.4h, v19.4h},[x1],#16 |
| sub x1,x1,#160 |
| mov x0,x8 |
| |
| |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| |
| smull v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) |
| smull v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v0.h[2] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smull v20.4s, v10.4h, v0.h[0] |
| smlsl v20.4s, v11.4h, v7.h[2] |
| |
| |
| smull v22.4s, v10.4h, v0.h[0] |
| smlsl v22.4s, v11.4h, v6.h[2] |
| |
| smull v16.4s, v10.4h, v0.h[0] |
| smlsl v16.4s, v11.4h, v5.h[2] |
| |
| smull v18.4s, v10.4h, v0.h[0] |
| smlsl v18.4s, v11.4h, v4.h[2] |
| |
| cmp x11,x10 |
| bhs shift3 |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| |
| |
| smlsl v24.4s, v14.4h, v5.h[1] |
| smlsl v26.4s, v14.4h, v7.h[3] |
| smlal v28.4s, v14.4h, v5.h[3] |
| smlal v30.4s, v14.4h, v3.h[1] |
| |
| |
| smlal v24.4s, v15.4h, v2.h[1] |
| smlal v26.4s, v15.4h, v1.h[1] |
| smlal v28.4s, v15.4h, v4.h[3] |
| smlsl v30.4s, v15.4h, v7.h[3] |
| |
| |
| smlsl v20.4s, v12.4h, v1.h[0] |
| smlal v20.4s, v13.4h, v6.h[2] |
| smlsl v22.4s, v12.4h, v3.h[0] |
| smlal v22.4s, v13.4h, v3.h[2] |
| smlsl v16.4s, v12.4h, v5.h[0] |
| smlal v16.4s, v13.4h, v0.h[2] |
| smlsl v18.4s, v12.4h, v7.h[0] |
| smlal v18.4s, v13.4h, v2.h[2] |
| |
| cmp x11,x9 |
| bhs shift3 |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v5.h[1] //// y1 * cos3(part of b1) |
| smlsl v28.4s, v8.4h, v0.h[3] //// y1 * sin3(part of b2) |
| smlsl v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlal v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v2.h[0] |
| smlsl v20.4s, v11.4h, v5.h[2] |
| |
| |
| smlal v22.4s, v10.4h, v6.h[0] |
| smlsl v22.4s, v11.4h, v0.h[2] |
| |
| smlsl v16.4s, v10.4h, v6.h[0] |
| smlsl v16.4s, v11.4h, v4.h[2] |
| |
| smlsl v18.4s, v10.4h, v2.h[0] |
| smlal v18.4s, v11.4h, v6.h[2] |
| |
| cmp x11,x5 |
| bhs shift3 |
| |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| |
| |
| |
| |
| smlsl v24.4s, v14.4h, v7.h[1] |
| smlal v26.4s, v14.4h, v2.h[1] |
| smlal v28.4s, v14.4h, v4.h[1] |
| smlsl v30.4s, v14.4h, v5.h[1] |
| |
| |
| smlal v24.4s, v15.4h, v0.h[3] |
| smlal v26.4s, v15.4h, v7.h[1] |
| smlsl v28.4s, v15.4h, v1.h[1] |
| smlsl v30.4s, v15.4h, v6.h[1] |
| |
| |
| smlsl v20.4s, v12.4h, v3.h[0] |
| smlal v20.4s, v13.4h, v4.h[2] |
| smlal v22.4s, v12.4h, v7.h[0] |
| smlal v22.4s, v13.4h, v2.h[2] |
| smlal v16.4s, v12.4h, v1.h[0] |
| smlsl v16.4s, v13.4h, v6.h[2] |
| smlal v18.4s, v12.4h, v5.h[0] |
| smlsl v18.4s, v13.4h, v0.h[2] |
| |
| |
| cmp x11,x7 |
| bhs shift3 |
| |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| |
| smlsl v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3(part of b1) |
| smlal v28.4s, v8.4h, v6.h[3] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v0.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlal v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v0.h[0] |
| smlsl v20.4s, v11.4h, v3.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v0.h[0] |
| smlsl v22.4s, v11.4h, v5.h[2] |
| |
| smlsl v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v1.h[2] |
| |
| smlal v18.4s, v10.4h, v0.h[0] |
| smlal v18.4s, v11.4h, v7.h[2] |
| |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| |
| smlal v24.4s, v14.4h, v6.h[3] |
| smlal v26.4s, v14.4h, v3.h[3] |
| smlsl v28.4s, v14.4h, v1.h[3] |
| smlal v30.4s, v14.4h, v7.h[1] |
| |
| |
| smlal v24.4s, v15.4h, v1.h[3] |
| smlsl v26.4s, v15.4h, v2.h[3] |
| smlal v28.4s, v15.4h, v7.h[1] |
| smlal v30.4s, v15.4h, v4.h[1] |
| |
| |
| smlsl v20.4s, v12.4h, v5.h[0] |
| smlal v20.4s, v13.4h, v2.h[2] |
| smlal v22.4s, v12.4h, v1.h[0] |
| smlsl v22.4s, v13.4h, v7.h[2] |
| smlsl v16.4s, v12.4h, v7.h[0] |
| smlsl v16.4s, v13.4h, v3.h[2] |
| smlsl v18.4s, v12.4h, v3.h[0] |
| smlal v18.4s, v13.4h, v1.h[2] |
| |
| |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| |
| |
| |
| smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) |
| smlal v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2) |
| smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlal v26.4s, v9.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlal v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v6.h[0] |
| smlsl v20.4s, v11.4h, v1.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v2.h[0] |
| smlal v22.4s, v11.4h, v4.h[2] |
| |
| smlal v16.4s, v10.4h, v2.h[0] |
| smlsl v16.4s, v11.4h, v7.h[2] |
| |
| smlsl v18.4s, v10.4h, v6.h[0] |
| smlsl v18.4s, v11.4h, v5.h[2] |
| |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| smlal v24.4s, v14.4h, v4.h[3] |
| smlsl v26.4s, v14.4h, v6.h[1] |
| smlal v28.4s, v14.4h, v7.h[3] |
| smlal v30.4s, v14.4h, v6.h[3] |
| |
| |
| smlal v24.4s, v15.4h, v3.h[3] |
| smlsl v26.4s, v15.4h, v3.h[1] |
| smlal v28.4s, v15.4h, v2.h[3] |
| smlsl v30.4s, v15.4h, v2.h[1] |
| |
| |
| smlsl v20.4s, v12.4h, v7.h[0] |
| smlal v20.4s, v13.4h, v0.h[2] |
| smlal v22.4s, v12.4h, v5.h[0] |
| smlsl v22.4s, v13.4h, v1.h[2] |
| smlsl v16.4s, v12.4h, v3.h[0] |
| smlal v16.4s, v13.4h, v2.h[2] |
| smlal v18.4s, v12.4h, v1.h[0] |
| smlsl v18.4s, v13.4h, v3.h[2] |
| |
| shift3: |
| add v8.4s, v20.4s , v24.4s |
| sub v10.4s, v20.4s , v24.4s |
| |
| add v12.4s, v22.4s , v26.4s |
| sub v24.4s, v22.4s , v26.4s |
| |
| add v14.4s, v16.4s , v28.4s |
| sub v26.4s, v16.4s , v28.4s |
| |
| |
| add v16.4s, v18.4s , v30.4s |
| sub v28.4s, v18.4s , v30.4s |
| |
| |
| sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) |
| |
| umov x15,v24.d[0] |
| umov x16,v25.d[0] |
| umov x19,v26.d[0] |
| umov x20,v27.d[0] |
| |
| trn1 v24.4h, v30.4h, v12.4h |
| trn2 v25.4h, v30.4h, v12.4h |
| trn1 v26.4h, v31.4h, v13.4h |
| trn2 v27.4h, v31.4h, v13.4h |
| |
| trn1 v30.2s, v24.2s, v26.2s |
| trn2 v31.2s, v24.2s, v26.2s |
| trn1 v12.2s, v25.2s, v27.2s |
| trn2 v13.2s, v25.2s, v27.2s |
| |
| trn1 v24.4h, v14.4h, v18.4h |
| trn2 v25.4h, v14.4h, v18.4h |
| trn1 v26.4h, v15.4h, v19.4h |
| trn2 v27.4h, v15.4h, v19.4h |
| |
| trn1 v14.2s, v24.2s, v26.2s |
| trn2 v15.2s, v24.2s, v26.2s |
| trn1 v18.2s, v25.2s, v27.2s |
| trn2 v19.2s, v25.2s, v27.2s |
| |
| mov v24.d[0],x15 |
| mov v25.d[0],x16 |
| mov v26.d[0],x19 |
| mov v27.d[0],x20 |
| st1 { v30.4h, v31.4h},[x1],#16 |
| st1 { v12.4h, v13.4h},[x1],#16 |
| add x1,x1,#64 |
| st1 { v14.4h, v15.4h},[x1],#16 |
| st1 { v18.4h, v19.4h},[x1],#16 |
| sub x1,x1,#96 |
| |
| mov x0,x8 |
| |
| |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| |
| smull v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) |
| smull v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v8.4h, v7.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v5.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smull v20.4s, v10.4h, v0.h[0] |
| smlsl v20.4s, v11.4h, v3.h[2] |
| |
| |
| smull v22.4s, v10.4h, v0.h[0] |
| smlsl v22.4s, v11.4h, v2.h[2] |
| |
| smull v16.4s, v10.4h, v0.h[0] |
| smlsl v16.4s, v11.4h, v1.h[2] |
| |
| smull v18.4s, v10.4h, v0.h[0] |
| smlsl v18.4s, v11.4h, v0.h[2] |
| |
| cmp x11,x10 |
| bhs shift4 |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v0.h[1] |
| smlal v26.4s, v14.4h, v1.h[3] |
| smlal v28.4s, v14.4h, v4.h[1] |
| smlal v30.4s, v14.4h, v6.h[3] |
| |
| |
| smlsl v24.4s, v15.4h, v4.h[1] |
| smlsl v26.4s, v15.4h, v0.h[3] |
| smlsl v28.4s, v15.4h, v2.h[3] |
| smlsl v30.4s, v15.4h, v6.h[1] |
| |
| |
| smlal v20.4s, v12.4h, v7.h[0] |
| smlal v20.4s, v13.4h, v5.h[2] |
| smlal v22.4s, v12.4h, v5.h[0] |
| smlsl v22.4s, v13.4h, v7.h[2] |
| smlal v16.4s, v12.4h, v3.h[0] |
| smlsl v16.4s, v13.4h, v4.h[2] |
| smlal v18.4s, v12.4h, v1.h[0] |
| smlsl v18.4s, v13.4h, v1.h[2] |
| |
| cmp x11,x9 |
| bhs shift4 |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| |
| |
| smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) |
| smlal v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1) |
| smlal v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlsl v20.4s, v10.4h, v2.h[0] |
| smlal v20.4s, v11.4h, v1.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v6.h[0] |
| smlal v22.4s, v11.4h, v3.h[2] |
| |
| smlal v16.4s, v10.4h, v6.h[0] |
| smlsl v16.4s, v11.4h, v7.h[2] |
| |
| smlal v18.4s, v10.4h, v2.h[0] |
| smlsl v18.4s, v11.4h, v2.h[2] |
| |
| cmp x11,x5 |
| bhs shift4 |
| |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| |
| |
| |
| |
| smlsl v24.4s, v14.4h, v1.h[1] |
| smlsl v26.4s, v14.4h, v7.h[3] |
| smlal v28.4s, v14.4h, v1.h[3] |
| smlal v30.4s, v14.4h, v4.h[3] |
| |
| |
| smlal v24.4s, v15.4h, v2.h[1] |
| smlal v26.4s, v15.4h, v5.h[1] |
| smlsl v28.4s, v15.4h, v3.h[1] |
| smlsl v30.4s, v15.4h, v4.h[1] |
| |
| |
| smlsl v20.4s, v12.4h, v5.h[0] |
| smlsl v20.4s, v13.4h, v7.h[2] |
| smlsl v22.4s, v12.4h, v1.h[0] |
| smlal v22.4s, v13.4h, v1.h[2] |
| smlsl v16.4s, v12.4h, v7.h[0] |
| smlal v16.4s, v13.4h, v5.h[2] |
| smlal v18.4s, v12.4h, v3.h[0] |
| smlsl v18.4s, v13.4h, v3.h[2] |
| |
| cmp x11,x7 |
| bhs shift4 |
| |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| |
| smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) |
| smlal v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlal v26.4s, v9.4h, v0.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v0.h[0] |
| smlsl v20.4s, v11.4h, v0.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v0.h[0] |
| smlal v22.4s, v11.4h, v6.h[2] |
| |
| smlsl v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v2.h[2] |
| |
| smlal v18.4s, v10.4h, v0.h[0] |
| smlsl v18.4s, v11.4h, v4.h[2] |
| |
| |
| |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v3.h[1] |
| smlsl v26.4s, v14.4h, v2.h[1] |
| smlal v28.4s, v14.4h, v7.h[3] |
| smlal v30.4s, v14.4h, v2.h[3] |
| |
| |
| smlsl v24.4s, v15.4h, v0.h[3] |
| smlal v26.4s, v15.4h, v4.h[3] |
| smlal v28.4s, v15.4h, v6.h[3] |
| smlsl v30.4s, v15.4h, v2.h[1] |
| |
| |
| smlal v20.4s, v12.4h, v3.h[0] |
| smlsl v20.4s, v13.4h, v6.h[2] |
| smlal v22.4s, v12.4h, v7.h[0] |
| smlsl v22.4s, v13.4h, v4.h[2] |
| smlsl v16.4s, v12.4h, v1.h[0] |
| smlal v16.4s, v13.4h, v0.h[2] |
| smlal v18.4s, v12.4h, v5.h[0] |
| smlsl v18.4s, v13.4h, v5.h[2] |
| |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v8.4h},[x0],x6 |
| ld1 {v11.4h},[x0],x6 |
| ld1 {v9.4h},[x0],x6 |
| |
| |
| |
| |
| |
| smlal v24.4s, v8.4h, v3.h[3] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1) |
| smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v6.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlsl v20.4s, v10.4h, v6.h[0] |
| smlal v20.4s, v11.4h, v2.h[2] |
| |
| |
| smlal v22.4s, v10.4h, v2.h[0] |
| smlsl v22.4s, v11.4h, v0.h[2] |
| |
| smlsl v16.4s, v10.4h, v2.h[0] |
| smlal v16.4s, v11.4h, v3.h[2] |
| |
| smlal v18.4s, v10.4h, v6.h[0] |
| smlsl v18.4s, v11.4h, v6.h[2] |
| |
| |
| ld1 {v12.4h},[x0],x6 |
| ld1 {v14.4h},[x0],x6 |
| ld1 {v13.4h},[x0],x6 |
| ld1 {v15.4h},[x0],x6 |
| |
| |
| |
| |
| smlsl v24.4s, v14.4h, v5.h[1] |
| smlal v26.4s, v14.4h, v3.h[3] |
| smlsl v28.4s, v14.4h, v2.h[1] |
| smlal v30.4s, v14.4h, v0.h[3] |
| |
| |
| smlal v24.4s, v15.4h, v1.h[3] |
| smlsl v26.4s, v15.4h, v1.h[1] |
| smlal v28.4s, v15.4h, v0.h[3] |
| smlsl v30.4s, v15.4h, v0.h[1] |
| |
| |
| smlsl v20.4s, v12.4h, v1.h[0] |
| smlal v20.4s, v13.4h, v4.h[2] |
| smlal v22.4s, v12.4h, v3.h[0] |
| smlsl v22.4s, v13.4h, v5.h[2] |
| smlsl v16.4s, v12.4h, v5.h[0] |
| smlal v16.4s, v13.4h, v6.h[2] |
| smlal v18.4s, v12.4h, v7.h[0] |
| smlsl v18.4s, v13.4h, v7.h[2] |
| |
| shift4: |
| add v8.4s, v20.4s , v24.4s |
| sub v10.4s, v20.4s , v24.4s |
| |
| add v12.4s, v22.4s , v26.4s |
| sub v24.4s, v22.4s , v26.4s |
| |
| add v14.4s, v16.4s , v28.4s |
| sub v26.4s, v16.4s , v28.4s |
| |
| |
| add v16.4s, v18.4s , v30.4s |
| sub v28.4s, v18.4s , v30.4s |
| |
| |
| sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) |
| |
| umov x15,v24.d[0] |
| umov x16,v25.d[0] |
| umov x19,v26.d[0] |
| umov x20,v27.d[0] |
| |
| trn1 v24.4h, v30.4h, v12.4h |
| trn2 v25.4h, v30.4h, v12.4h |
| trn1 v26.4h, v31.4h, v13.4h |
| trn2 v27.4h, v31.4h, v13.4h |
| |
| trn1 v30.2s, v24.2s, v26.2s |
| trn2 v31.2s, v24.2s, v26.2s |
| trn1 v12.2s, v25.2s, v27.2s |
| trn2 v13.2s, v25.2s, v27.2s |
| |
| trn1 v24.4h, v14.4h, v18.4h |
| trn2 v25.4h, v14.4h, v18.4h |
| trn1 v26.4h, v15.4h, v19.4h |
| trn2 v27.4h, v15.4h, v19.4h |
| |
| trn1 v14.2s, v24.2s, v26.2s |
| trn2 v15.2s, v24.2s, v26.2s |
| trn1 v18.2s, v25.2s, v27.2s |
| trn2 v19.2s, v25.2s, v27.2s |
| |
| mov v24.d[0],x15 |
| mov v25.d[0],x16 |
| mov v26.d[0],x19 |
| mov v27.d[0],x20 |
| |
| st1 { v30.4h, v31.4h},[x1],#16 |
| st1 { v12.4h, v13.4h},[x1],#16 |
| st1 { v14.4h, v15.4h},[x1],#16 |
| st1 { v18.4h, v19.4h},[x1],#16 |
| |
| add x1,x1,#96 |
| |
| subs x14,x14,#1 |
| bne dct_stage1 |
| second_stage_dct: |
| // mov x0,x1 |
| ldp x8, x7,[sp],#16 |
| ldp x0, x1,[sp],#16 |
| |
| // add x4,x2,x8, lsl #1 @ x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data |
| // add x5,x8,x8, lsl #1 @ |
| // sub x0,x0,#512 |
| mov x11,#0xfffffff0 |
| mov x5, #0xffffff00 |
| ldr w6, x5_addr |
| ldr w9, x9_addr |
| // sub x1,x1,#2048 |
| mov x4,x1 |
| mov x10,#240 |
| mov x14,#8 |
| b stage2 |
| |
| // registers free : |
| |
| // arm registers used |
| // x8 : predicition stride |
| // x7 : destination stride |
| // x1: temp buffer |
| // x2 : pred buffer |
| // x3 : destination buffer |
| // x14 : loop counter |
| //x0 : scratch buffer |
| //x10 : used as stride |
| // x4 : used to store the initial address |
| //x12 : zero cols |
| // x11 : 0xfffffff0 |
| // x5 : 0xffffff00 |
| dct_stage2: |
| add x4,x4,#32 |
| mov x1,x4 |
| stage2: |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0) |
| smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlal v26.4s, v9.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| smull v20.4s, v10.4h, v0.h[0] |
| smlal v20.4s, v11.4h, v0.h[2] |
| |
| |
| smull v22.4s, v10.4h, v0.h[0] |
| smlal v22.4s, v11.4h, v1.h[2] |
| |
| smull v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v2.h[2] |
| |
| smull v18.4s, v10.4h, v0.h[0] |
| smlal v18.4s, v11.4h, v3.h[2] |
| cmp x12,x11 |
| bhs stage2_shift1 |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v1.h[1] |
| smlal v26.4s, v14.4h, v3.h[3] |
| smlal v28.4s, v14.4h, v6.h[1] |
| smlsl v30.4s, v14.4h, v7.h[1] |
| |
| |
| smlal v24.4s, v15.4h, v1.h[3] |
| smlal v26.4s, v15.4h, v5.h[1] |
| smlsl v28.4s, v15.4h, v7.h[1] |
| smlsl v30.4s, v15.4h, v3.h[3] |
| |
| |
| smlal v20.4s, v12.4h, v1.h[0] |
| smlal v20.4s, v13.4h, v1.h[2] |
| smlal v22.4s, v12.4h, v3.h[0] |
| smlal v22.4s, v13.4h, v4.h[2] |
| smlal v16.4s, v12.4h, v5.h[0] |
| smlal v16.4s, v13.4h, v7.h[2] |
| smlal v18.4s, v12.4h, v7.h[0] |
| smlsl v18.4s, v13.4h, v5.h[2] |
| cmp x12,x5 |
| bhs stage2_shift1 |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| smlal v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0) |
| smlal v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) |
| smlsl v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2) |
| smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v2.h[0] |
| smlal v20.4s, v11.4h, v2.h[2] |
| |
| |
| smlal v22.4s, v10.4h, v6.h[0] |
| smlal v22.4s, v11.4h, v7.h[2] |
| |
| smlsl v16.4s, v10.4h, v6.h[0] |
| smlsl v16.4s, v11.4h, v3.h[2] |
| |
| smlsl v18.4s, v10.4h, v2.h[0] |
| smlsl v18.4s, v11.4h, v1.h[2] |
| |
| cmp x12,x6 |
| bhs stage2_shift1 |
| |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v3.h[1] |
| smlsl v26.4s, v14.4h, v6.h[1] |
| smlsl v28.4s, v14.4h, v0.h[1] |
| smlsl v30.4s, v14.4h, v6.h[3] |
| |
| |
| smlal v24.4s, v15.4h, v3.h[3] |
| smlsl v26.4s, v15.4h, v4.h[3] |
| smlsl v28.4s, v15.4h, v2.h[3] |
| smlal v30.4s, v15.4h, v5.h[3] |
| |
| |
| smlal v20.4s, v12.4h, v3.h[0] |
| smlal v20.4s, v13.4h, v3.h[2] |
| smlsl v22.4s, v12.4h, v7.h[0] |
| smlsl v22.4s, v13.4h, v5.h[2] |
| smlsl v16.4s, v12.4h, v1.h[0] |
| smlsl v16.4s, v13.4h, v1.h[2] |
| smlsl v18.4s, v12.4h, v5.h[0] |
| smlal v18.4s, v13.4h, v7.h[2] |
| |
| cmp x12,x9 |
| bhs stage2_shift1 |
| |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| |
| smlal v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1) |
| smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlal v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v0.h[0] |
| smlal v20.4s, v11.4h, v4.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v0.h[0] |
| smlsl v22.4s, v11.4h, v2.h[2] |
| |
| smlsl v16.4s, v10.4h, v0.h[0] |
| smlsl v16.4s, v11.4h, v6.h[2] |
| |
| smlal v18.4s, v10.4h, v0.h[0] |
| smlal v18.4s, v11.4h, v0.h[2] |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v5.h[1] |
| smlsl v26.4s, v14.4h, v0.h[2] |
| smlal v28.4s, v14.4h, v5.h[3] |
| smlal v30.4s, v14.4h, v4.h[3] |
| |
| |
| smlal v24.4s, v15.4h, v5.h[3] |
| smlsl v26.4s, v15.4h, v1.h[1] |
| smlal v28.4s, v15.4h, v3.h[1] |
| smlsl v30.4s, v15.4h, v7.h[3] |
| |
| |
| smlal v20.4s, v12.4h, v5.h[0] |
| smlal v20.4s, v13.4h, v5.h[2] |
| smlsl v22.4s, v12.4h, v1.h[0] |
| smlsl v22.4s, v13.4h, v0.h[2] |
| smlal v16.4s, v12.4h, v7.h[0] |
| smlal v16.4s, v13.4h, v4.h[2] |
| smlal v18.4s, v12.4h, v3.h[0] |
| smlal v18.4s, v13.4h, v6.h[2] |
| |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| |
| |
| |
| smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) |
| smlal v28.4s, v8.4h, v0.h[1] //// y1 * sin3(part of b2) |
| smlsl v30.4s, v8.4h, v4.h[1] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v9.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v6.h[0] |
| smlal v20.4s, v11.4h, v6.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v2.h[0] |
| smlsl v22.4s, v11.4h, v3.h[2] |
| |
| smlal v16.4s, v10.4h, v2.h[0] |
| smlal v16.4s, v11.4h, v0.h[2] |
| |
| smlsl v18.4s, v10.4h, v6.h[0] |
| smlsl v18.4s, v11.4h, v2.h[2] |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| smlal v24.4s, v14.4h, v7.h[1] |
| smlsl v26.4s, v14.4h, v5.h[3] |
| smlal v28.4s, v14.4h, v4.h[1] |
| smlsl v30.4s, v14.4h, v2.h[3] |
| |
| |
| smlal v24.4s, v15.4h, v7.h[3] |
| smlsl v26.4s, v15.4h, v7.h[1] |
| smlal v28.4s, v15.4h, v6.h[3] |
| smlsl v30.4s, v15.4h, v6.h[1] |
| |
| |
| smlal v20.4s, v12.4h, v7.h[0] |
| smlal v20.4s, v13.4h, v7.h[2] |
| smlsl v22.4s, v12.4h, v5.h[0] |
| smlsl v22.4s, v13.4h, v6.h[2] |
| smlal v16.4s, v12.4h, v3.h[0] |
| smlal v16.4s, v13.4h, v5.h[2] |
| smlsl v18.4s, v12.4h, v1.h[0] |
| smlsl v18.4s, v13.4h, v4.h[2] |
| |
| stage2_shift1: |
| add v8.4s, v20.4s , v24.4s |
| sub v10.4s, v20.4s , v24.4s |
| |
| add v12.4s, v22.4s , v26.4s |
| sub v24.4s, v22.4s , v26.4s |
| |
| add v14.4s, v16.4s , v28.4s |
| sub v26.4s, v16.4s , v28.4s |
| |
| |
| add v16.4s, v18.4s , v30.4s |
| sub v28.4s, v18.4s , v30.4s |
| |
| |
| sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct) |
| |
| |
| umov x15,v24.d[0] |
| umov x16,v25.d[0] |
| umov x19,v26.d[0] |
| umov x20,v27.d[0] |
| |
| trn1 v24.4h, v30.4h, v12.4h |
| trn2 v25.4h, v30.4h, v12.4h |
| trn1 v26.4h, v31.4h, v13.4h |
| trn2 v27.4h, v31.4h, v13.4h |
| |
| trn1 v30.2s, v24.2s, v26.2s |
| trn2 v31.2s, v24.2s, v26.2s |
| trn1 v12.2s, v25.2s, v27.2s |
| trn2 v13.2s, v25.2s, v27.2s |
| |
| trn1 v24.4h, v14.4h, v18.4h |
| trn2 v25.4h, v14.4h, v18.4h |
| trn1 v26.4h, v15.4h, v19.4h |
| trn2 v27.4h, v15.4h, v19.4h |
| |
| trn1 v14.2s, v24.2s, v26.2s |
| trn2 v15.2s, v24.2s, v26.2s |
| trn1 v18.2s, v25.2s, v27.2s |
| trn2 v19.2s, v25.2s, v27.2s |
| |
| mov v24.d[0],x15 |
| mov v25.d[0],x16 |
| mov v26.d[0],x19 |
| mov v27.d[0],x20 |
| |
| st1 { v30.4h, v31.4h},[x0],#16 |
| st1 { v12.4h, v13.4h},[x0],#16 |
| st1 { v14.4h, v15.4h},[x0],#16 |
| st1 { v18.4h, v19.4h},[x0],#16 |
| |
| mov x1,x4 |
| |
| |
| |
| |
| |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| |
| smull v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0) |
| smull v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smull v20.4s, v10.4h, v0.h[0] |
| smlal v20.4s, v11.4h, v4.h[2] |
| |
| |
| smull v22.4s, v10.4h, v0.h[0] |
| smlal v22.4s, v11.4h, v5.h[2] |
| |
| smull v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v6.h[2] |
| |
| smull v18.4s, v10.4h, v0.h[0] |
| smlal v18.4s, v11.4h, v7.h[2] |
| |
| cmp x12,x11 |
| bhs stage2_shift2 |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| |
| smlsl v24.4s, v14.4h, v4.h[3] |
| smlsl v26.4s, v14.4h, v2.h[1] |
| smlsl v28.4s, v14.4h, v0.h[1] |
| smlsl v30.4s, v14.4h, v2.h[3] |
| |
| |
| smlsl v24.4s, v15.4h, v0.h[3] |
| smlsl v26.4s, v15.4h, v3.h[1] |
| smlsl v28.4s, v15.4h, v6.h[3] |
| smlal v30.4s, v15.4h, v5.h[3] |
| |
| |
| smlsl v20.4s, v12.4h, v7.h[0] |
| smlsl v20.4s, v13.4h, v2.h[2] |
| smlsl v22.4s, v12.4h, v5.h[0] |
| smlsl v22.4s, v13.4h, v0.h[2] |
| smlsl v16.4s, v12.4h, v3.h[0] |
| smlsl v16.4s, v13.4h, v3.h[2] |
| smlsl v18.4s, v12.4h, v1.h[0] |
| smlsl v18.4s, v13.4h, v6.h[2] |
| |
| cmp x12,x5 |
| bhs stage2_shift2 |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| |
| |
| |
| |
| smlsl v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) |
| smlal v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1) |
| smlal v28.4s, v8.4h, v2.h[3] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlal v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v6.h[3] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlsl v20.4s, v10.4h, v2.h[0] |
| smlsl v20.4s, v11.4h, v6.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v6.h[0] |
| smlal v22.4s, v11.4h, v4.h[2] |
| |
| smlal v16.4s, v10.4h, v6.h[0] |
| smlal v16.4s, v11.4h, v0.h[2] |
| |
| smlal v18.4s, v10.4h, v2.h[0] |
| smlal v18.4s, v11.4h, v5.h[2] |
| |
| cmp x12,x6 |
| bhs stage2_shift2 |
| |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v2.h[3] |
| smlal v26.4s, v14.4h, v3.h[3] |
| smlsl v28.4s, v14.4h, v5.h[3] |
| smlsl v30.4s, v14.4h, v0.h[3] |
| |
| |
| smlal v24.4s, v15.4h, v1.h[3] |
| smlsl v26.4s, v15.4h, v6.h[3] |
| smlsl v28.4s, v15.4h, v0.h[3] |
| smlal v30.4s, v15.4h, v7.h[3] |
| |
| |
| smlal v20.4s, v12.4h, v5.h[0] |
| smlal v20.4s, v13.4h, v0.h[2] |
| smlal v22.4s, v12.4h, v1.h[0] |
| smlal v22.4s, v13.4h, v6.h[2] |
| smlal v16.4s, v12.4h, v7.h[0] |
| smlsl v16.4s, v13.4h, v2.h[2] |
| smlsl v18.4s, v12.4h, v3.h[0] |
| smlsl v18.4s, v13.4h, v4.h[2] |
| |
| cmp x12,x9 |
| bhs stage2_shift2 |
| |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| |
| |
| smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v1.h[1] //// y1 * cos3(part of b1) |
| smlsl v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v5.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlal v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v0.h[0] |
| smlsl v20.4s, v11.4h, v7.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v0.h[0] |
| smlsl v22.4s, v11.4h, v1.h[2] |
| |
| smlsl v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v5.h[2] |
| |
| smlal v18.4s, v10.4h, v0.h[0] |
| smlal v18.4s, v11.4h, v3.h[2] |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| |
| |
| |
| smlsl v24.4s, v14.4h, v0.h[1] |
| smlal v26.4s, v14.4h, v6.h[1] |
| smlal v28.4s, v14.4h, v4.h[1] |
| smlsl v30.4s, v14.4h, v1.h[1] |
| |
| |
| smlsl v24.4s, v15.4h, v3.h[3] |
| smlal v26.4s, v15.4h, v0.h[1] |
| smlsl v28.4s, v15.4h, v5.h[1] |
| smlsl v30.4s, v15.4h, v6.h[1] |
| |
| |
| smlsl v20.4s, v12.4h, v3.h[0] |
| smlsl v20.4s, v13.4h, v1.h[2] |
| smlsl v22.4s, v12.4h, v7.h[0] |
| smlal v22.4s, v13.4h, v3.h[2] |
| smlal v16.4s, v12.4h, v1.h[0] |
| smlal v16.4s, v13.4h, v7.h[2] |
| smlsl v18.4s, v12.4h, v5.h[0] |
| smlsl v18.4s, v13.4h, v2.h[2] |
| |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| |
| smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) |
| smlal v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1) |
| smlsl v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlsl v20.4s, v10.4h, v6.h[0] |
| smlal v20.4s, v11.4h, v5.h[2] |
| |
| |
| smlal v22.4s, v10.4h, v2.h[0] |
| smlal v22.4s, v11.4h, v7.h[2] |
| |
| smlsl v16.4s, v10.4h, v2.h[0] |
| smlsl v16.4s, v11.4h, v4.h[2] |
| |
| smlal v18.4s, v10.4h, v6.h[0] |
| smlal v18.4s, v11.4h, v1.h[2] |
| |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| |
| |
| smlal v24.4s, v14.4h, v1.h[1] |
| smlsl v26.4s, v14.4h, v0.h[3] |
| smlal v28.4s, v14.4h, v1.h[3] |
| smlsl v30.4s, v14.4h, v3.h[1] |
| |
| |
| smlal v24.4s, v15.4h, v5.h[3] |
| smlsl v26.4s, v15.4h, v5.h[1] |
| smlal v28.4s, v15.4h, v4.h[3] |
| smlsl v30.4s, v15.4h, v4.h[1] |
| |
| |
| smlal v20.4s, v12.4h, v1.h[0] |
| smlal v20.4s, v13.4h, v3.h[2] |
| smlsl v22.4s, v12.4h, v3.h[0] |
| smlsl v22.4s, v13.4h, v2.h[2] |
| smlal v16.4s, v12.4h, v5.h[0] |
| smlal v16.4s, v13.4h, v1.h[2] |
| smlsl v18.4s, v12.4h, v7.h[0] |
| smlsl v18.4s, v13.4h, v0.h[2] |
| |
| stage2_shift2: |
| add v8.4s, v20.4s , v24.4s |
| sub v10.4s, v20.4s , v24.4s |
| |
| add v12.4s, v22.4s , v26.4s |
| sub v24.4s, v22.4s , v26.4s |
| |
| add v14.4s, v16.4s , v28.4s |
| sub v26.4s, v16.4s , v28.4s |
| |
| |
| add v16.4s, v18.4s , v30.4s |
| sub v28.4s, v18.4s , v30.4s |
| |
| |
| sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct) |
| |
| umov x15,v24.d[0] |
| umov x16,v25.d[0] |
| umov x19,v26.d[0] |
| umov x20,v27.d[0] |
| |
| trn1 v24.4h, v30.4h, v12.4h |
| trn2 v25.4h, v30.4h, v12.4h |
| trn1 v26.4h, v31.4h, v13.4h |
| trn2 v27.4h, v31.4h, v13.4h |
| |
| trn1 v30.2s, v24.2s, v26.2s |
| trn2 v31.2s, v24.2s, v26.2s |
| trn1 v12.2s, v25.2s, v27.2s |
| trn2 v13.2s, v25.2s, v27.2s |
| |
| trn1 v24.4h, v14.4h, v18.4h |
| trn2 v25.4h, v14.4h, v18.4h |
| trn1 v26.4h, v15.4h, v19.4h |
| trn2 v27.4h, v15.4h, v19.4h |
| |
| trn1 v14.2s, v24.2s, v26.2s |
| trn2 v15.2s, v24.2s, v26.2s |
| trn1 v18.2s, v25.2s, v27.2s |
| trn2 v19.2s, v25.2s, v27.2s |
| |
| mov v24.d[0],x15 |
| mov v25.d[0],x16 |
| mov v26.d[0],x19 |
| mov v27.d[0],x20 |
| |
| st1 { v30.4h, v31.4h},[x0],#16 |
| st1 { v12.4h, v13.4h},[x0],#16 |
| st1 { v14.4h, v15.4h},[x0],#16 |
| st1 { v18.4h, v19.4h},[x0],#16 |
| |
| |
| mov x1,x4 |
| |
| |
| |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| smull v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) |
| smull v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v0.h[2] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smull v20.4s, v10.4h, v0.h[0] |
| smlsl v20.4s, v11.4h, v7.h[2] |
| |
| |
| smull v22.4s, v10.4h, v0.h[0] |
| smlsl v22.4s, v11.4h, v6.h[2] |
| |
| smull v16.4s, v10.4h, v0.h[0] |
| smlsl v16.4s, v11.4h, v5.h[2] |
| |
| smull v18.4s, v10.4h, v0.h[0] |
| smlsl v18.4s, v11.4h, v4.h[2] |
| |
| cmp x12,x11 |
| bhs stage2_shift3 |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| smlsl v24.4s, v14.4h, v5.h[1] |
| smlsl v26.4s, v14.4h, v7.h[3] |
| smlal v28.4s, v14.4h, v5.h[3] |
| smlal v30.4s, v14.4h, v3.h[1] |
| |
| |
| smlal v24.4s, v15.4h, v2.h[1] |
| smlal v26.4s, v15.4h, v1.h[1] |
| smlal v28.4s, v15.4h, v4.h[3] |
| smlsl v30.4s, v15.4h, v7.h[3] |
| |
| |
| smlsl v20.4s, v12.4h, v1.h[0] |
| smlal v20.4s, v13.4h, v6.h[2] |
| smlsl v22.4s, v12.4h, v3.h[0] |
| smlal v22.4s, v13.4h, v3.h[2] |
| smlsl v16.4s, v12.4h, v5.h[0] |
| smlal v16.4s, v13.4h, v0.h[2] |
| smlsl v18.4s, v12.4h, v7.h[0] |
| smlal v18.4s, v13.4h, v2.h[2] |
| |
| cmp x12,x5 |
| bhs stage2_shift3 |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| |
| |
| smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v5.h[1] //// y1 * cos3(part of b1) |
| smlsl v28.4s, v8.4h, v0.h[3] //// y1 * sin3(part of b2) |
| smlsl v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlal v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v2.h[0] |
| smlsl v20.4s, v11.4h, v5.h[2] |
| |
| |
| smlal v22.4s, v10.4h, v6.h[0] |
| smlsl v22.4s, v11.4h, v0.h[2] |
| |
| smlsl v16.4s, v10.4h, v6.h[0] |
| smlsl v16.4s, v11.4h, v4.h[2] |
| |
| smlsl v18.4s, v10.4h, v2.h[0] |
| smlal v18.4s, v11.4h, v6.h[2] |
| |
| cmp x12,x6 |
| bhs stage2_shift3 |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| |
| |
| |
| |
| smlsl v24.4s, v14.4h, v7.h[1] |
| smlal v26.4s, v14.4h, v2.h[1] |
| smlal v28.4s, v14.4h, v4.h[1] |
| smlsl v30.4s, v14.4h, v5.h[1] |
| |
| |
| smlal v24.4s, v15.4h, v0.h[3] |
| smlal v26.4s, v15.4h, v7.h[1] |
| smlsl v28.4s, v15.4h, v1.h[1] |
| smlsl v30.4s, v15.4h, v6.h[1] |
| |
| |
| smlsl v20.4s, v12.4h, v3.h[0] |
| smlal v20.4s, v13.4h, v4.h[2] |
| smlal v22.4s, v12.4h, v7.h[0] |
| smlal v22.4s, v13.4h, v2.h[2] |
| smlal v16.4s, v12.4h, v1.h[0] |
| smlsl v16.4s, v13.4h, v6.h[2] |
| smlal v18.4s, v12.4h, v5.h[0] |
| smlsl v18.4s, v13.4h, v0.h[2] |
| |
| cmp x12,x9 |
| bhs stage2_shift3 |
| |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| |
| smlsl v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3(part of b1) |
| smlal v28.4s, v8.4h, v6.h[3] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v0.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlal v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v0.h[0] |
| smlsl v20.4s, v11.4h, v3.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v0.h[0] |
| smlsl v22.4s, v11.4h, v5.h[2] |
| |
| smlsl v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v1.h[2] |
| |
| smlal v18.4s, v10.4h, v0.h[0] |
| smlal v18.4s, v11.4h, v7.h[2] |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v6.h[3] |
| smlal v26.4s, v14.4h, v3.h[3] |
| smlsl v28.4s, v14.4h, v1.h[3] |
| smlal v30.4s, v14.4h, v7.h[1] |
| |
| |
| smlal v24.4s, v15.4h, v1.h[3] |
| smlsl v26.4s, v15.4h, v2.h[3] |
| smlal v28.4s, v15.4h, v7.h[1] |
| smlal v30.4s, v15.4h, v4.h[1] |
| |
| |
| smlsl v20.4s, v12.4h, v5.h[0] |
| smlal v20.4s, v13.4h, v2.h[2] |
| smlal v22.4s, v12.4h, v1.h[0] |
| smlsl v22.4s, v13.4h, v7.h[2] |
| smlsl v16.4s, v12.4h, v7.h[0] |
| smlsl v16.4s, v13.4h, v3.h[2] |
| smlsl v18.4s, v12.4h, v3.h[0] |
| smlal v18.4s, v13.4h, v1.h[2] |
| |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| |
| smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) |
| smlal v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2) |
| smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlal v26.4s, v9.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlal v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v6.h[0] |
| smlsl v20.4s, v11.4h, v1.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v2.h[0] |
| smlal v22.4s, v11.4h, v4.h[2] |
| |
| smlal v16.4s, v10.4h, v2.h[0] |
| smlsl v16.4s, v11.4h, v7.h[2] |
| |
| smlsl v18.4s, v10.4h, v6.h[0] |
| smlsl v18.4s, v11.4h, v5.h[2] |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| |
| |
| smlal v24.4s, v14.4h, v4.h[3] |
| smlsl v26.4s, v14.4h, v6.h[1] |
| smlal v28.4s, v14.4h, v7.h[3] |
| smlal v30.4s, v14.4h, v6.h[3] |
| |
| |
| smlal v24.4s, v15.4h, v3.h[3] |
| smlsl v26.4s, v15.4h, v3.h[1] |
| smlal v28.4s, v15.4h, v2.h[3] |
| smlsl v30.4s, v15.4h, v2.h[1] |
| |
| |
| smlsl v20.4s, v12.4h, v7.h[0] |
| smlal v20.4s, v13.4h, v0.h[2] |
| smlal v22.4s, v12.4h, v5.h[0] |
| smlsl v22.4s, v13.4h, v1.h[2] |
| smlsl v16.4s, v12.4h, v3.h[0] |
| smlal v16.4s, v13.4h, v2.h[2] |
| smlal v18.4s, v12.4h, v1.h[0] |
| smlsl v18.4s, v13.4h, v3.h[2] |
| |
| stage2_shift3: |
| add v8.4s, v20.4s , v24.4s |
| sub v10.4s, v20.4s , v24.4s |
| |
| add v12.4s, v22.4s , v26.4s |
| sub v24.4s, v22.4s , v26.4s |
| |
| add v14.4s, v16.4s , v28.4s |
| sub v26.4s, v16.4s , v28.4s |
| |
| |
| add v16.4s, v18.4s , v30.4s |
| sub v28.4s, v18.4s , v30.4s |
| |
| |
| sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x11 = (a0 - b0 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct) |
| |
| umov x15,v24.d[0] |
| umov x16,v25.d[0] |
| umov x19,v26.d[0] |
| umov x20,v27.d[0] |
| |
| trn1 v24.4h, v30.4h, v12.4h |
| trn2 v25.4h, v30.4h, v12.4h |
| trn1 v26.4h, v31.4h, v13.4h |
| trn2 v27.4h, v31.4h, v13.4h |
| |
| trn1 v30.2s, v24.2s, v26.2s |
| trn2 v31.2s, v24.2s, v26.2s |
| trn1 v12.2s, v25.2s, v27.2s |
| trn2 v13.2s, v25.2s, v27.2s |
| |
| trn1 v24.4h, v14.4h, v18.4h |
| trn2 v25.4h, v14.4h, v18.4h |
| trn1 v26.4h, v15.4h, v19.4h |
| trn2 v27.4h, v15.4h, v19.4h |
| |
| trn1 v14.2s, v24.2s, v26.2s |
| trn2 v15.2s, v24.2s, v26.2s |
| trn1 v18.2s, v25.2s, v27.2s |
| trn2 v19.2s, v25.2s, v27.2s |
| |
| mov v24.d[0],x15 |
| mov v25.d[0],x16 |
| mov v26.d[0],x19 |
| mov v27.d[0],x20 |
| |
| st1 { v30.4h, v31.4h},[x0],#16 |
| st1 { v12.4h, v13.4h},[x0],#16 |
| st1 { v14.4h, v15.4h},[x0],#16 |
| st1 { v18.4h, v19.4h},[x0],#16 |
| |
| |
| |
| mov x1,x4 |
| |
| |
| |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| |
| smull v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) |
| smull v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v8.4h, v7.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v5.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smull v20.4s, v10.4h, v0.h[0] |
| smlsl v20.4s, v11.4h, v3.h[2] |
| |
| |
| smull v22.4s, v10.4h, v0.h[0] |
| smlsl v22.4s, v11.4h, v2.h[2] |
| |
| smull v16.4s, v10.4h, v0.h[0] |
| smlsl v16.4s, v11.4h, v1.h[2] |
| |
| smull v18.4s, v10.4h, v0.h[0] |
| smlsl v18.4s, v11.4h, v0.h[2] |
| |
| cmp x12,x11 |
| bhs stage2_shift4 |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v0.h[1] |
| smlal v26.4s, v14.4h, v1.h[3] |
| smlal v28.4s, v14.4h, v4.h[1] |
| smlal v30.4s, v14.4h, v6.h[3] |
| |
| |
| smlsl v24.4s, v15.4h, v4.h[1] |
| smlsl v26.4s, v15.4h, v0.h[3] |
| smlsl v28.4s, v15.4h, v2.h[3] |
| smlsl v30.4s, v15.4h, v6.h[1] |
| |
| |
| smlal v20.4s, v12.4h, v7.h[0] |
| smlal v20.4s, v13.4h, v5.h[2] |
| smlal v22.4s, v12.4h, v5.h[0] |
| smlsl v22.4s, v13.4h, v7.h[2] |
| smlal v16.4s, v12.4h, v3.h[0] |
| smlsl v16.4s, v13.4h, v4.h[2] |
| smlal v18.4s, v12.4h, v1.h[0] |
| smlsl v18.4s, v13.4h, v1.h[2] |
| |
| cmp x12,x5 |
| bhs stage2_shift4 |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| |
| |
| smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) |
| smlal v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1) |
| smlal v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlsl v20.4s, v10.4h, v2.h[0] |
| smlal v20.4s, v11.4h, v1.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v6.h[0] |
| smlal v22.4s, v11.4h, v3.h[2] |
| |
| smlal v16.4s, v10.4h, v6.h[0] |
| smlsl v16.4s, v11.4h, v7.h[2] |
| |
| smlal v18.4s, v10.4h, v2.h[0] |
| smlsl v18.4s, v11.4h, v2.h[2] |
| |
| cmp x12,x6 |
| bhs stage2_shift4 |
| |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| |
| |
| |
| |
| |
| smlsl v24.4s, v14.4h, v1.h[1] |
| smlsl v26.4s, v14.4h, v7.h[3] |
| smlal v28.4s, v14.4h, v1.h[3] |
| smlal v30.4s, v14.4h, v4.h[3] |
| |
| |
| smlal v24.4s, v15.4h, v2.h[1] |
| smlal v26.4s, v15.4h, v5.h[1] |
| smlsl v28.4s, v15.4h, v3.h[1] |
| smlsl v30.4s, v15.4h, v4.h[1] |
| |
| |
| smlsl v20.4s, v12.4h, v5.h[0] |
| smlsl v20.4s, v13.4h, v7.h[2] |
| smlsl v22.4s, v12.4h, v1.h[0] |
| smlal v22.4s, v13.4h, v1.h[2] |
| smlsl v16.4s, v12.4h, v7.h[0] |
| smlal v16.4s, v13.4h, v5.h[2] |
| smlal v18.4s, v12.4h, v3.h[0] |
| smlsl v18.4s, v13.4h, v3.h[2] |
| |
| cmp x12,x9 |
| bhs stage2_shift4 |
| |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| |
| smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) |
| smlal v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlal v26.4s, v9.4h, v0.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlal v20.4s, v10.4h, v0.h[0] |
| smlsl v20.4s, v11.4h, v0.h[2] |
| |
| |
| smlsl v22.4s, v10.4h, v0.h[0] |
| smlal v22.4s, v11.4h, v6.h[2] |
| |
| smlsl v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v2.h[2] |
| |
| smlal v18.4s, v10.4h, v0.h[0] |
| smlsl v18.4s, v11.4h, v4.h[2] |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| |
| |
| |
| smlal v24.4s, v14.4h, v3.h[1] |
| smlsl v26.4s, v14.4h, v2.h[1] |
| smlal v28.4s, v14.4h, v7.h[3] |
| smlal v30.4s, v14.4h, v2.h[3] |
| |
| |
| smlsl v24.4s, v15.4h, v0.h[3] |
| smlal v26.4s, v15.4h, v4.h[3] |
| smlal v28.4s, v15.4h, v6.h[3] |
| smlsl v30.4s, v15.4h, v2.h[1] |
| |
| |
| smlal v20.4s, v12.4h, v3.h[0] |
| smlsl v20.4s, v13.4h, v6.h[2] |
| smlal v22.4s, v12.4h, v7.h[0] |
| smlsl v22.4s, v13.4h, v4.h[2] |
| smlsl v16.4s, v12.4h, v1.h[0] |
| smlal v16.4s, v13.4h, v0.h[2] |
| smlal v18.4s, v12.4h, v5.h[0] |
| smlsl v18.4s, v13.4h, v5.h[2] |
| |
| |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],x10 |
| |
| |
| |
| |
| smlal v24.4s, v8.4h, v3.h[3] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1) |
| smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) |
| smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v9.4h, v6.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| smlsl v20.4s, v10.4h, v6.h[0] |
| smlal v20.4s, v11.4h, v2.h[2] |
| |
| |
| smlal v22.4s, v10.4h, v2.h[0] |
| smlsl v22.4s, v11.4h, v0.h[2] |
| |
| smlsl v16.4s, v10.4h, v2.h[0] |
| smlal v16.4s, v11.4h, v3.h[2] |
| |
| smlal v18.4s, v10.4h, v6.h[0] |
| smlsl v18.4s, v11.4h, v6.h[2] |
| |
| |
| ld1 {v12.4h, v13.4h},[x1],#16 |
| ld1 {v14.4h, v15.4h},[x1],x10 |
| |
| |
| |
| smlsl v24.4s, v14.4h, v5.h[1] |
| smlal v26.4s, v14.4h, v3.h[3] |
| smlsl v28.4s, v14.4h, v2.h[1] |
| smlal v30.4s, v14.4h, v0.h[3] |
| |
| |
| smlal v24.4s, v15.4h, v1.h[3] |
| smlsl v26.4s, v15.4h, v1.h[1] |
| smlal v28.4s, v15.4h, v0.h[3] |
| smlsl v30.4s, v15.4h, v0.h[1] |
| |
| |
| smlsl v20.4s, v12.4h, v1.h[0] |
| smlal v20.4s, v13.4h, v4.h[2] |
| smlal v22.4s, v12.4h, v3.h[0] |
| smlsl v22.4s, v13.4h, v5.h[2] |
| smlsl v16.4s, v12.4h, v5.h[0] |
| smlal v16.4s, v13.4h, v6.h[2] |
| smlal v18.4s, v12.4h, v7.h[0] |
| smlsl v18.4s, v13.4h, v7.h[2] |
| |
| stage2_shift4: |
| add v8.4s, v20.4s , v24.4s |
| sub v10.4s, v20.4s , v24.4s |
| |
| add v12.4s, v22.4s , v26.4s |
| sub v24.4s, v22.4s , v26.4s |
| |
| add v14.4s, v16.4s , v28.4s |
| sub v26.4s, v16.4s , v28.4s |
| |
| |
| add v16.4s, v18.4s , v30.4s |
| sub v28.4s, v18.4s , v30.4s |
| |
| |
| sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x11 = (a0 - b0 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct) |
| sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct) |
| |
| |
| |
| umov x15,v24.d[0] |
| umov x16,v25.d[0] |
| umov x19,v26.d[0] |
| umov x20,v27.d[0] |
| |
| trn1 v24.4h, v30.4h, v12.4h |
| trn2 v25.4h, v30.4h, v12.4h |
| trn1 v26.4h, v31.4h, v13.4h |
| trn2 v27.4h, v31.4h, v13.4h |
| |
| trn1 v30.2s, v24.2s, v26.2s |
| trn2 v31.2s, v24.2s, v26.2s |
| trn1 v12.2s, v25.2s, v27.2s |
| trn2 v13.2s, v25.2s, v27.2s |
| |
| trn1 v24.4h, v14.4h, v18.4h |
| trn2 v25.4h, v14.4h, v18.4h |
| trn1 v26.4h, v15.4h, v19.4h |
| trn2 v27.4h, v15.4h, v19.4h |
| |
| trn1 v14.2s, v24.2s, v26.2s |
| trn2 v15.2s, v24.2s, v26.2s |
| trn1 v18.2s, v25.2s, v27.2s |
| trn2 v19.2s, v25.2s, v27.2s |
| |
| mov v24.d[0],x15 |
| mov v25.d[0],x16 |
| mov v26.d[0],x19 |
| mov v27.d[0],x20 |
| |
| st1 { v30.4h, v31.4h},[x0],#16 |
| st1 { v12.4h, v13.4h},[x0],#16 |
| st1 { v14.4h, v15.4h},[x0],#16 |
| st1 { v18.4h, v19.4h},[x0],#16 |
| |
| |
| |
| |
| sub x0,x0,#256 |
| prediction_buffer: |
| |
| |
| ld1 {v12.8h},[x0],#16 |
| ld1 {v14.8h},[x0],#16 |
| |
| add x0,x0,#32 |
| |
| ld1 {v16.8h},[x0],#16 |
| ld1 {v18.8h},[x0],#16 |
| add x0,x0,#32 |
| |
| ld1 {v20.8h},[x0],#16 |
| ld1 {v22.8h},[x0],#16 |
| |
| |
| add x0,x0,#32 |
| |
| ld1 {v24.8h},[x0],#16 |
| ld1 {v26.8h},[x0],#16 |
| |
| |
| |
| |
| |
| // d12 =x0 1- 4 values |
| // d13 =x2 1- 4 values |
| // d14=x1 1- 4 values |
| // d15=x3 1- 4 values |
| |
| // d16 =x0 5- 8 values |
| // d17 =x2 5- 8 values |
| // d18=x1 5- 8 values |
| // d19=x3 5- 8 values |
| |
| // d20 =x0 9- 12 values |
| // d21 =x2 9- 12 values |
| // d22=x1 9- 12 values |
| // d23=x3 9- 12 values |
| |
| // d24 =x0 13-16 values |
| // d25 =x2 13- 16 values |
| // d26=x1 13- 16 values |
| // d27=x3 13- 16 values |
| |
| // swapping v12 upper and v16 lower 64bits |
| mov v13.d[0], v12.d[1] |
| mov v12.d[1], v16.d[0] |
| mov v16.d[0], v13.d[0] |
| // swapping v20 upper and v24 lower 64bits |
| mov v21.d[0], v20.d[1] |
| mov v20.d[1], v24.d[0] |
| mov v24.d[0], v21.d[0] |
| // swapping v14 uppper and v18 lower 64bits |
| mov v15.d[0], v14.d[1] |
| mov v14.d[1], v18.d[0] |
| mov v18.d[0], v15.d[0] |
| // swapping v22 upper and v26 lower 64bits |
| mov v23.d[0], v22.d[1] |
| mov v22.d[1], v26.d[0] |
| mov v26.d[0], v23.d[0] |
| |
| |
| ld1 {v8.8b, v9.8b},[x2],x8 |
| ld1 {v10.8b, v11.8b},[x2],x8 |
| ld1 {v28.8b, v29.8b},[x2],x8 |
| ld1 {v30.8b, v31.8b},[x2],x8 |
| |
| |
| uaddw v12.8h, v12.8h , v8.8b |
| uaddw v20.8h, v20.8h , v9.8b |
| uaddw v14.8h, v14.8h , v10.8b |
| uaddw v22.8h, v22.8h , v11.8b |
| uaddw v16.8h, v16.8h , v28.8b |
| uaddw v24.8h, v24.8h , v29.8b |
| uaddw v18.8h, v18.8h , v30.8b |
| uaddw v26.8h, v26.8h , v31.8b |
| sub x2,x2,x8,lsl #2 |
| add x2,x2,#16 |
| sqxtun v12.8b, v12.8h |
| sqxtun v13.8b, v20.8h |
| sqxtun v20.8b, v14.8h |
| sqxtun v21.8b, v22.8h |
| sqxtun v14.8b, v16.8h |
| sqxtun v15.8b, v24.8h |
| sqxtun v22.8b, v18.8h |
| sqxtun v23.8b, v26.8h |
| |
| |
| st1 {v12.8b, v13.8b},[x3],x7 |
| st1 {v20.8b, v21.8b},[x3],x7 |
| st1 {v14.8b, v15.8b},[x3],x7 |
| st1 {v22.8b, v23.8b},[x3],x7 |
| |
| |
| sub x3,x3,x7,lsl #2 |
| add x3,x3,#16 |
| |
| ld1 {v12.8h},[x0],#16 |
| ld1 {v14.8h},[x0],#16 |
| |
| sub x0,x0,#96 |
| |
| ld1 {v16.8h},[x0],#16 |
| ld1 {v18.8h},[x0],#16 |
| sub x0,x0,#96 |
| |
| ld1 {v20.8h},[x0],#16 |
| ld1 {v22.8h},[x0],#16 |
| |
| |
| sub x0,x0,#96 |
| |
| ld1 {v24.8h},[x0],#16 |
| ld1 {v26.8h},[x0],#16 |
| |
| |
| sub x0,x0,#64 |
| |
| |
| // swapping v12 upper and v16 lower 64bits |
| mov v13.d[0], v12.d[1] |
| mov v12.d[1], v16.d[0] |
| mov v16.d[0], v13.d[0] |
| // swapping v20 upper and v24 lower 64bits |
| mov v21.d[0], v20.d[1] |
| mov v20.d[1], v24.d[0] |
| mov v24.d[0], v21.d[0] |
| // swapping v14 uppper and v18 lower 64bits |
| mov v15.d[0], v14.d[1] |
| mov v14.d[1], v18.d[0] |
| mov v18.d[0], v15.d[0] |
| // swapping v22 upper and v26 lower 64bits |
| mov v23.d[0], v22.d[1] |
| mov v22.d[1], v26.d[0] |
| mov v26.d[0], v23.d[0] |
| |
| |
| ld1 {v8.8b, v9.8b},[x2],x8 |
| ld1 {v10.8b, v11.8b},[x2],x8 |
| ld1 {v28.8b, v29.8b},[x2],x8 |
| ld1 {v30.8b, v31.8b},[x2],x8 |
| |
| |
| uaddw v12.8h, v12.8h , v8.8b |
| uaddw v20.8h, v20.8h , v9.8b |
| uaddw v14.8h, v14.8h , v10.8b |
| uaddw v22.8h, v22.8h , v11.8b |
| uaddw v16.8h, v16.8h , v28.8b |
| uaddw v24.8h, v24.8h , v29.8b |
| uaddw v18.8h, v18.8h , v30.8b |
| uaddw v26.8h, v26.8h , v31.8b |
| sub x2,x2,#16 |
| |
| sqxtun v12.8b, v12.8h |
| sqxtun v13.8b, v20.8h |
| sqxtun v20.8b, v14.8h |
| sqxtun v21.8b, v22.8h |
| sqxtun v14.8b, v16.8h |
| sqxtun v15.8b, v24.8h |
| sqxtun v22.8b, v18.8h |
| sqxtun v23.8b, v26.8h |
| |
| |
| st1 {v12.8b, v13.8b},[x3],x7 |
| st1 {v20.8b, v21.8b},[x3],x7 |
| st1 {v14.8b, v15.8b},[x3],x7 |
| st1 {v22.8b, v23.8b},[x3],x7 |
| |
| sub x3,x3,#16 |
| |
| subs x14,x14,#1 |
| bne dct_stage2 |
| // ldmfd sp!,{x0-x12,pc} |
| ldp x19, x20,[sp],#16 |
| pop_v_regs |
| ret |
| |
| |
| |
| |
| |