| ///***************************************************************************** |
| //* |
| //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //*****************************************************************************/ |
| ///** |
| // ******************************************************************************* |
| // * @file |
// * ihevc_itrans_recon_16x16_neon.s
| // * |
| // * @brief |
| // * contains function definitions for single stage inverse transform |
| // * |
| // * @author |
| // * anand s |
| // * |
| // * @par list of functions: |
| // * - ihevc_itrans_recon_16x16() |
| // * |
| // * @remarks |
| // * none |
| // * |
| // ******************************************************************************* |
| //*/ |
| |
| ///** |
| // ******************************************************************************* |
| // * |
| // * @brief |
// * this function performs inverse transform and reconstruction for 16x16
// * input block
| // * |
| // * @par description: |
| // * performs inverse transform and adds the prediction data and clips output |
| // * to 8 bit |
| // * |
| // * @param[in] pi2_src |
| // * input 16x16 coefficients |
| // * |
| // * @param[in] pi2_tmp |
// * temporary 16x16 buffer for storing inverse transform
// * 1st stage output
| // * |
| // * @param[in] pu1_pred |
| // * prediction 16x16 block |
| // * |
// * @param[out] pu1_dst
// * output 16x16 block
| // * |
| // * @param[in] src_strd |
| // * input stride |
| // * |
| // * @param[in] pred_strd |
| // * prediction stride |
| // * |
| // * @param[in] dst_strd |
| // * output stride |
| // * |
| // * @param[in] shift |
| // * output shift |
| // * |
| // * @param[in] x12 |
| // * zero columns in pi2_src |
| // * |
| // * @returns void |
| // * |
| // * @remarks |
| // * none |
| // * |
| // ******************************************************************************* |
| // */ |
| |
| //void ihevc_itrans_recon_16x16(word16 *pi2_src, |
| // word16 *pi2_tmp, |
| // uword8 *pu1_pred, |
| // uword8 *pu1_dst, |
| // word32 src_strd, |
| // word32 pred_strd, |
| // word32 dst_strd, |
// word32 x12,
| // word32 x11 ) |
| |
| //**************variables vs registers************************* |
| // x0 => *pi2_src |
| // x1 => *pi2_tmp |
| // x2 => *pu1_pred |
| // x3 => *pu1_dst |
| // src_strd |
| // pred_strd |
| // dst_strd |
| // x12 |
| // x11 |
| |
| .text |
| .align 4 |
| |
| .include "ihevc_neon_macros.s" |
| |
| |
| |
| |
| .set shift_stage1_idct , 7 |
| .set shift_stage2_idct , 12 |
| //#define zero_cols x12 |
| //#define zero_rows x11 |
| .globl ihevc_itrans_recon_16x16_av8 |
| |
| .extern g_ai2_ihevc_trans_16_transpose |
| |
| .type ihevc_itrans_recon_16x16_av8, %function |
| |
| ihevc_itrans_recon_16x16_av8: |
| |
| ldr w11, [sp] |
| // stmfd sp!,{x4-x12,x14} |
| push_v_regs |
| stp x19, x20,[sp,#-16]! |
| stp x5, x6,[sp,#-16]! |
| // add sp,sp,#40 |
| |
| |
| |
| // ldr x8,[sp,#4] @ prediction stride |
| // ldr x7,[sp,#8] @ destination stride |
| mov x6, x4 // src stride |
| mov x12, x7 |
| |
| |
| |
| adrp x14, :got:g_ai2_ihevc_trans_16_transpose |
| ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_16_transpose] |
| ld1 {v0.4h, v1.4h, v2.4h, v3.4h},[x14] ////d0,d1 are used for storing the constant data |
| mov x7,#0xffff |
| and x12,x12,x7 |
| and x11,x11,x7 |
| lsl x6, x6, #1 // x sizeof(word16) |
| add x9,x0,x6, lsl #1 // 2 rows |
| |
| add x10,x6,x6, lsl #1 // 3 rows |
| add x5,x6,x6,lsl #2 |
| mov x7,#0xfff0 |
| |
| cmp x12,x7 |
| bge zero_12cols_decision |
| |
| mov x19,#0xff00 |
| cmp x12,x19 |
| bge zero_8cols_decision |
| |
| |
| |
| |
| mov x14,#4 |
| cmp x11,x7 |
| sub x20,x6,#0 |
| neg x20, x20 |
| csel x10,x20,x10,ge |
| |
| mov x19,#0xff00 |
| cmp x11,x19 |
| csel x8, x5, x8,ge |
| sub x20,x8,#0 |
| neg x20, x20 |
| csel x8,x20,x8,ge |
| csel x8, x10, x8,lt |
| add x5,x5,x6,lsl #3 |
| sub x20,x5,#0 |
| neg x5, x20 |
| |
| b first_stage_top_four_bottom_four |
| |
| zero_12cols_decision: |
| mov x14,#1 |
| mov x19,#0xff00 |
| cmp x11,x19 |
| csel x8, x5, x8,ge |
| csel x8, x10, x8,lt |
| add x5,x5,x6,lsl #3 |
| sub x20,x5,#0 |
| neg x5, x20 |
| |
| b first_stage_top_four_bottom_four |
| |
| zero_8cols_decision: |
| mov x14,#2 |
| mov x8,x5 |
| sub x20,x8,#0 |
| neg x8, x20 |
| mov x19,#0xff00 |
| cmp x11,x19 |
| csel x8, x10, x8,lt |
| add x5,x5,x6,lsl #3 |
| sub x20,x5,#0 |
| neg x5, x20 |
| cmp x11,x7 |
| sub x20,x6,#0 |
| neg x20, x20 |
| csel x10,x20,x10,ge |
| |
| |
| b first_stage_top_four_bottom_four |
| |
| |
| //d0[0]= 64 d2[0]=64 |
| //d0[1]= 90 d2[1]=57 |
| //d0[2]= 89 d2[2]=50 |
| //d0[3]= 87 d2[3]=43 |
| //d1[0]= 83 d3[0]=36 |
| //d1[1]= 80 d3[1]=25 |
| //d1[2]= 75 d3[2]=18 |
| //d1[3]= 70 d3[3]=9 |
| |
| |
| |
| first_stage: |
| add x0,x0,#8 |
| add x9,x9,#8 |
| |
| first_stage_top_four_bottom_four: |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v11.4h},[x9],x6 |
| ld1 {v6.4h},[x0],x10 |
| ld1 {v7.4h},[x9],x10 |
| cmp x11,x7 |
| bge skip_load4rows |
| |
| ld1 {v4.4h},[x0],x6 |
| ld1 {v5.4h},[x9],x6 |
| ld1 {v8.4h},[x0],x8 |
| ld1 {v9.4h},[x9],x8 |
| |
| // registers used: q0,q1,q3,q5,q2,q4 |
| |
| // d10 =x0 |
| //d6= x1 |
| //d11=x2 |
| //d7=x3 |
| |
| skip_load4rows: |
| smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0) |
| smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlal v26.4s, v7.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v7.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| |
| smull v12.4s, v10.4h, v0.h[0] |
| smlal v12.4s, v11.4h, v0.h[2] |
| smull v14.4s, v10.4h, v0.h[0] |
| smlal v14.4s, v11.4h, v1.h[2] |
| smull v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v2.h[2] |
| smull v18.4s, v10.4h, v0.h[0] |
| smlal v18.4s, v11.4h, v3.h[2] |
| |
| bge skip_last12rows_kernel1 |
| |
| |
| smlal v24.4s, v8.4h, v1.h[1] |
| smlal v26.4s, v8.4h, v3.h[3] |
| smlsl v28.4s, v8.4h, v1.h[3] |
| smlsl v30.4s, v8.4h, v0.h[3] |
| |
| |
| smlal v24.4s, v9.4h, v1.h[3] |
| smlsl v26.4s, v9.4h, v2.h[3] |
| smlsl v28.4s, v9.4h, v0.h[3] |
| smlal v30.4s, v9.4h, v3.h[3] |
| |
| |
| |
| |
| |
| smlal v12.4s, v4.4h, v1.h[0] |
| smlal v12.4s, v5.4h, v1.h[2] |
| smlal v14.4s, v4.4h, v3.h[0] |
| smlsl v14.4s, v5.4h, v3.h[2] |
| smlsl v16.4s, v4.4h, v3.h[0] |
| smlsl v16.4s, v5.4h, v0.h[2] |
| smlsl v18.4s, v4.4h, v1.h[0] |
| smlsl v18.4s, v5.4h, v2.h[2] |
| |
| //d0[0]= 64 d2[0]=64 |
| //d0[1]= 90 d2[1]=57 |
| //d0[2]= 89 d2[2]=50 |
| //d0[3]= 87 d2[3]=43 |
| //d1[0]= 83 d3[0]=36 |
| //d1[1]= 80 d3[1]=25 |
| //d1[2]= 75 d3[2]=18 |
| //d1[3]= 70 d3[3]=9 |
| mov x19,#0xff00 |
| cmp x11,x19 |
| bge skip_last12rows_kernel1 |
| |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v11.4h},[x9],x6 |
| ld1 {v6.4h},[x0],x10 |
| ld1 {v7.4h},[x9],x10 |
| ld1 {v4.4h},[x0],x6 |
| ld1 {v5.4h},[x9],x6 |
| ld1 {v8.4h},[x0],x5 |
| ld1 {v9.4h},[x9],x5 |
| |
| |
| |
| |
| smlal v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v6.4h, v1.h[1] //// y1 * cos3(part of b1) |
| smlsl v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2) |
| smlal v30.4s, v6.4h, v0.h[1] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v7.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v7.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlal v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| smlal v24.4s, v8.4h, v3.h[1] |
| smlsl v26.4s, v8.4h, v1.h[3] |
| smlal v28.4s, v8.4h, v0.h[1] |
| smlsl v30.4s, v8.4h, v1.h[1] |
| |
| |
| smlal v24.4s, v9.4h, v3.h[3] |
| smlsl v26.4s, v9.4h, v3.h[1] |
| smlal v28.4s, v9.4h, v2.h[3] |
| smlsl v30.4s, v9.4h, v2.h[1] |
| |
| |
| |
| |
| |
| smlal v12.4s, v10.4h, v0.h[0] |
| smlal v12.4s, v11.4h, v2.h[2] |
| smlal v12.4s, v4.4h, v3.h[0] |
| smlal v12.4s, v5.4h, v3.h[2] |
| |
| |
| |
| |
| smlsl v14.4s, v10.4h, v0.h[0] |
| smlsl v14.4s, v11.4h, v0.h[2] |
| smlsl v14.4s, v4.4h, v1.h[0] |
| smlsl v14.4s, v5.4h, v2.h[2] |
| |
| |
| smlsl v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v3.h[2] |
| smlal v16.4s, v4.4h, v1.h[0] |
| smlal v16.4s, v5.4h, v1.h[2] |
| |
| |
| smlal v18.4s, v10.4h, v0.h[0] |
| smlal v18.4s, v11.4h, v1.h[2] |
| smlsl v18.4s, v4.4h, v3.h[0] |
| smlsl v18.4s, v5.4h, v0.h[2] |
| |
| skip_last12rows_kernel1: |
| add v20.4s, v12.4s , v24.4s |
| sub v22.4s, v12.4s , v24.4s |
| |
| add v12.4s, v14.4s , v26.4s |
| sub v24.4s, v14.4s , v26.4s |
| |
| add v14.4s, v16.4s , v28.4s |
| sub v26.4s, v16.4s , v28.4s |
| |
| |
| add v16.4s, v18.4s , v30.4s |
| sub v28.4s, v18.4s , v30.4s |
| |
| |
| |
| |
| |
| |
| |
| sqrshrn v30.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v19.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) |
| |
| st1 {v30.4h, v31.4h},[x1],#16 |
| st1 {v18.4h, v19.4h},[x1],#16 |
| sub x1,x1,#32 |
| |
| bge skip_stage1_kernel_load |
| |
| first_stage_middle_eight: |
| |
| |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v11.4h},[x9],x6 |
| ld1 {v6.4h},[x0],x10 |
| ld1 {v7.4h},[x9],x10 |
| ld1 {v4.4h},[x0],x6 |
| ld1 {v5.4h},[x9],x6 |
| ld1 {v8.4h},[x0],x8 |
| ld1 {v9.4h},[x9],x8 |
| |
| |
| skip_stage1_kernel_load: |
| smull v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0) |
| smull v26.4s, v6.4h, v2.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v6.4h, v3.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v7.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v7.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| |
| |
| |
| smull v22.4s, v10.4h, v0.h[0] |
| smlsl v22.4s, v11.4h, v3.h[2] |
| smull v20.4s, v10.4h, v0.h[0] |
| smlsl v20.4s, v11.4h, v2.h[2] |
| smull v16.4s, v10.4h, v0.h[0] |
| smlsl v16.4s, v11.4h, v1.h[2] |
| smull v18.4s, v10.4h, v0.h[0] |
| smlsl v18.4s, v11.4h, v0.h[2] |
| |
| |
| cmp x11,x7 |
| bge skip_last12rows_kernel2 |
| |
| smlsl v24.4s, v8.4h, v3.h[1] |
| smlal v26.4s, v8.4h, v2.h[1] |
| smlal v28.4s, v8.4h, v0.h[1] |
| smlal v30.4s, v8.4h, v2.h[3] |
| |
| |
| smlal v24.4s, v9.4h, v0.h[1] |
| smlal v26.4s, v9.4h, v3.h[1] |
| smlsl v28.4s, v9.4h, v1.h[1] |
| smlsl v30.4s, v9.4h, v2.h[1] |
| |
| |
| |
| smlsl v22.4s, v4.4h, v1.h[0] |
| smlal v22.4s, v5.4h, v2.h[2] |
| smlsl v20.4s, v4.4h, v3.h[0] |
| smlal v20.4s, v5.4h, v0.h[2] |
| smlal v16.4s, v4.4h, v3.h[0] |
| smlal v16.4s, v5.4h, v3.h[2] |
| smlal v18.4s, v4.4h, v1.h[0] |
| smlsl v18.4s, v5.4h, v1.h[2] |
| |
| //d0[0]= 64 d2[0]=64 |
| //d0[1]= 90 d2[1]=57 |
| //d0[2]= 89 d2[2]=50 |
| //d0[3]= 87 d2[3]=43 |
| //d1[0]= 83 d3[0]=36 |
| //d1[1]= 80 d3[1]=25 |
| //d1[2]= 75 d3[2]=18 |
| //d1[3]= 70 d3[3]=9 |
| mov x19,#0xff00 |
| cmp x11,x19 |
| bge skip_last12rows_kernel2 |
| |
| ld1 {v10.4h},[x0],x6 |
| ld1 {v11.4h},[x9],x6 |
| ld1 {v6.4h},[x0],x10 |
| ld1 {v7.4h},[x9],x10 |
| ld1 {v4.4h},[x0],x6 |
| ld1 {v5.4h},[x9],x6 |
| ld1 {v8.4h},[x0],x5 |
| ld1 {v9.4h},[x9],x5 |
| |
| |
| smlsl v24.4s, v6.4h, v3.h[3] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) |
| smlal v28.4s, v6.4h, v2.h[3] //// y1 * sin3(part of b2) |
| smlal v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlal v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| smlal v24.4s, v8.4h, v2.h[3] |
| smlal v26.4s, v8.4h, v3.h[3] |
| smlsl v28.4s, v8.4h, v2.h[1] |
| smlal v30.4s, v8.4h, v0.h[3] |
| |
| |
| smlal v24.4s, v9.4h, v1.h[3] |
| smlsl v26.4s, v9.4h, v1.h[1] |
| smlal v28.4s, v9.4h, v0.h[3] |
| smlsl v30.4s, v9.4h, v0.h[1] |
| |
| |
| |
| |
| smlal v22.4s, v10.4h, v0.h[0] |
| smlsl v22.4s, v11.4h, v1.h[2] |
| smlsl v22.4s, v4.4h, v3.h[0] |
| smlal v22.4s, v5.4h, v0.h[2] |
| |
| |
| |
| smlsl v20.4s, v10.4h, v0.h[0] |
| smlsl v20.4s, v11.4h, v3.h[2] |
| smlal v20.4s, v4.4h, v1.h[0] |
| smlsl v20.4s, v5.4h, v1.h[2] |
| |
| |
| smlsl v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v0.h[2] |
| smlsl v16.4s, v4.4h, v1.h[0] |
| smlal v16.4s, v5.4h, v2.h[2] |
| |
| |
| |
| smlal v18.4s, v10.4h, v0.h[0] |
| smlsl v18.4s, v11.4h, v2.h[2] |
| smlal v18.4s, v4.4h, v3.h[0] |
| smlsl v18.4s, v5.4h, v3.h[2] |
| |
| skip_last12rows_kernel2: |
| |
| add v4.4s, v22.4s , v24.4s |
| sub v22.4s, v22.4s , v24.4s |
| |
| add v6.4s, v20.4s , v26.4s |
| sub v24.4s, v20.4s , v26.4s |
| |
| add v10.4s, v16.4s , v28.4s |
| sub v26.4s, v16.4s , v28.4s |
| |
| |
| add v16.4s, v18.4s , v30.4s |
| sub v28.4s, v18.4s , v30.4s |
| |
| |
| sqrshrn v18.4h, v4.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v31.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v30.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v20.4h, v6.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v23.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v21.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v22.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) |
| |
| |
| // registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11} |
| |
| |
| |
| |
| |
| |
| ld1 {v4.4h, v5.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],#16 |
| sub x1,x1,#32 |
| |
| //d4=x0 |
| //d12=x1 |
| //d5=x2 |
| //d13=x3 |
| |
| //d18=x4 |
| //d20=x5 |
| //d19=x6 |
| //d21=x7 |
| |
| //d22=x8 |
| //d30=x9 |
| //d23=x10 |
| //d31=x11 |
| |
| //d14=x12 |
| //d8=x13 |
| //d15=x14 |
| //d9=x15 |
| |
| umov x15,v26.d[0] |
| umov x16,v27.d[0] |
| umov x19,v28.d[0] |
| umov x20,v29.d[0] |
| |
| trn1 v26.4h, v4.4h, v12.4h |
| trn2 v27.4h, v4.4h, v12.4h |
| trn1 v28.4h, v5.4h, v13.4h |
| trn2 v29.4h, v5.4h, v13.4h |
| |
| trn1 v4.2s, v26.2s, v28.2s |
| trn2 v5.2s, v26.2s, v28.2s |
| trn1 v12.2s, v27.2s, v29.2s |
| trn2 v13.2s, v27.2s, v29.2s |
| |
| trn1 v26.4h, v18.4h, v20.4h |
| trn2 v27.4h, v18.4h, v20.4h |
| trn1 v28.4h, v19.4h, v21.4h |
| trn2 v29.4h, v19.4h, v21.4h |
| |
| trn1 v18.2s, v26.2s, v28.2s |
| trn2 v19.2s, v26.2s, v28.2s |
| trn1 v20.2s, v27.2s, v29.2s |
| trn2 v21.2s, v27.2s, v29.2s |
| |
| trn1 v26.4h, v22.4h, v30.4h |
| trn2 v27.4h, v22.4h, v30.4h |
| trn1 v28.4h, v23.4h, v31.4h |
| trn2 v29.4h, v23.4h, v31.4h |
| |
| trn1 v22.2s, v26.2s, v28.2s |
| trn2 v23.2s, v26.2s, v28.2s |
| trn1 v30.2s, v27.2s, v29.2s |
| trn2 v31.2s, v27.2s, v29.2s |
| |
| trn1 v26.4h, v14.4h, v8.4h |
| trn2 v27.4h, v14.4h, v8.4h |
| trn1 v28.4h, v15.4h, v9.4h |
| trn2 v29.4h, v15.4h, v9.4h |
| |
| trn1 v14.2s, v26.2s, v28.2s |
| trn2 v15.2s, v26.2s, v28.2s |
| trn1 v8.2s, v27.2s, v29.2s |
| trn2 v9.2s, v27.2s, v29.2s |
| |
| mov v26.d[0],x15 |
| mov v27.d[0],x16 |
| mov v28.d[0],x19 |
| mov v29.d[0],x20 |
| |
| // d4 =x0 1- 4 values |
| // d5 =x2 1- 4 values |
| // d12=x1 1- 4 values |
| // d13=x3 1- 4 values |
| |
| // d18 =x0 5- 8 values |
| // d19 =x2 5- 8 values |
| // d20=x1 5- 8 values |
| // d21=x3 5- 8 values |
| |
| // d22 =x0 9- 12 values |
| // d23 =x2 9- 12 values |
| // d30=x1 9- 12 values |
| // d31=x3 9- 12 values |
| |
| // d14 =x0 13-16 values |
| // d15 =x2 13- 16 values |
| // d8=x1 13- 16 values |
| // d9=x3 13- 16 values |
| |
| |
| st1 { v4.4h, v5.4h},[x1],#16 |
| st1 { v12.4h, v13.4h},[x1],#16 |
| |
| st1 { v18.4h, v19.4h},[x1],#16 |
| st1 { v20.4h, v21.4h},[x1],#16 |
| st1 { v22.4h, v23.4h},[x1],#16 |
| st1 { v30.4h, v31.4h},[x1],#16 |
| st1 { v14.4h, v15.4h},[x1],#16 |
| st1 { v8.4h, v9.4h},[x1],#16 |
| |
| |
| subs x14,x14,#1 |
| bne first_stage |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| mov x6,x7 |
| |
| ldp x8, x7,[sp],#16 |
| |
| mov x10,#16 |
| |
| cmp x12,x6 |
| sub x20,x1,#128 |
| csel x1, x20, x1,ge |
| bge label1 |
| |
| mov x19,#0xff00 |
| cmp x12,x19 |
| sub x20,x1,#256 |
| csel x1, x20, x1,ge |
| bge label_2 |
| |
| sub x1,x1,#512 |
| sub x20,x10,#0 |
| neg x10, x20 |
| |
| label_2: |
| add x9,x1,#128 |
| add x11,x9,#128 |
| add x0,x11,#128 |
| |
| |
| |
| label1: |
| // mov x6,x1 |
| |
| |
| mov x14,#4 |
| add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data |
| add x5,x8,x8, lsl #1 // |
| // add x0,x3,x7, lsl #1 @ x0 points to 3rd row of dest data |
| // add x10,x7,x7, lsl #1 @ |
| |
| |
| |
| |
| second_stage: |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v6.4h, v7.4h},[x1],x10 |
| cmp x12,x6 |
| bge second_stage_process |
| ld1 {v4.4h, v5.4h},[x9],#16 |
| ld1 {v8.4h, v9.4h},[x9],x10 |
| |
| second_stage_process: |
| |
| |
| smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0) |
| smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlal v26.4s, v7.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v7.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| smull v12.4s, v10.4h, v0.h[0] |
| smlal v12.4s, v11.4h, v0.h[2] |
| smull v14.4s, v10.4h, v0.h[0] |
| smlal v14.4s, v11.4h, v1.h[2] |
| smull v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v2.h[2] |
| smull v18.4s, v10.4h, v0.h[0] |
| smlal v18.4s, v11.4h, v3.h[2] |
| |
| bge skip_last8rows_stage2_kernel1 |
| |
| smlal v24.4s, v8.4h, v1.h[1] |
| smlal v26.4s, v8.4h, v3.h[3] |
| smlsl v28.4s, v8.4h, v1.h[3] |
| smlsl v30.4s, v8.4h, v0.h[3] |
| |
| |
| smlal v24.4s, v9.4h, v1.h[3] |
| smlsl v26.4s, v9.4h, v2.h[3] |
| smlsl v28.4s, v9.4h, v0.h[3] |
| smlal v30.4s, v9.4h, v3.h[3] |
| |
| |
| smlal v12.4s, v4.4h, v1.h[0] |
| smlal v12.4s, v5.4h, v1.h[2] |
| smlal v14.4s, v4.4h, v3.h[0] |
| smlsl v14.4s, v5.4h, v3.h[2] |
| smlsl v16.4s, v4.4h, v3.h[0] |
| smlsl v16.4s, v5.4h, v0.h[2] |
| smlsl v18.4s, v4.4h, v1.h[0] |
| smlsl v18.4s, v5.4h, v2.h[2] |
| |
| mov x19,#0xff00 |
| cmp x12,x19 |
| bge skip_last8rows_stage2_kernel1 |
| |
| |
| ld1 {v10.4h, v11.4h},[x11],#16 |
| ld1 {v6.4h, v7.4h},[x11],x10 |
| ld1 {v4.4h, v5.4h},[x0],#16 |
| ld1 {v8.4h, v9.4h},[x0],x10 |
| |
| |
| |
| |
| |
| smlal v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v6.4h, v1.h[1] //// y1 * cos3(part of b1) |
| smlsl v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2) |
| smlal v30.4s, v6.4h, v0.h[1] //// y1 * sin1(part of b3) |
| |
| smlal v24.4s, v7.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v7.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlal v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| |
| smlal v24.4s, v8.4h, v3.h[1] |
| smlsl v26.4s, v8.4h, v1.h[3] |
| smlal v28.4s, v8.4h, v0.h[1] |
| smlsl v30.4s, v8.4h, v1.h[1] |
| |
| |
| smlal v24.4s, v9.4h, v3.h[3] |
| smlsl v26.4s, v9.4h, v3.h[1] |
| smlal v28.4s, v9.4h, v2.h[3] |
| smlsl v30.4s, v9.4h, v2.h[1] |
| |
| |
| |
| |
| |
| smlal v12.4s, v10.4h, v0.h[0] |
| smlal v12.4s, v11.4h, v2.h[2] |
| smlal v12.4s, v4.4h, v3.h[0] |
| smlal v12.4s, v5.4h, v3.h[2] |
| |
| |
| |
| |
| smlsl v14.4s, v10.4h, v0.h[0] |
| smlsl v14.4s, v11.4h, v0.h[2] |
| smlsl v14.4s, v4.4h, v1.h[0] |
| smlsl v14.4s, v5.4h, v2.h[2] |
| |
| |
| smlsl v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v3.h[2] |
| smlal v16.4s, v4.4h, v1.h[0] |
| smlal v16.4s, v5.4h, v1.h[2] |
| |
| |
| smlal v18.4s, v10.4h, v0.h[0] |
| smlal v18.4s, v11.4h, v1.h[2] |
| smlsl v18.4s, v4.4h, v3.h[0] |
| smlsl v18.4s, v5.4h, v0.h[2] |
| |
| |
| |
| |
| |
| |
| skip_last8rows_stage2_kernel1: |
| |
| |
| |
| add v20.4s, v12.4s , v24.4s |
| sub v22.4s, v12.4s , v24.4s |
| |
| add v12.4s, v14.4s , v26.4s |
| sub v24.4s, v14.4s , v26.4s |
| |
| add v14.4s, v16.4s , v28.4s |
| sub v26.4s, v16.4s , v28.4s |
| |
| |
| add v16.4s, v18.4s , v30.4s |
| sub v28.4s, v18.4s , v30.4s |
| |
| |
| |
| |
| |
| |
| |
| sqrshrn v30.4h, v20.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v19.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) |
| |
| bge skip_stage2_kernel_load |
| |
| //q2,q4,q6,q7 is used |
| ld1 {v10.4h, v11.4h},[x1],#16 |
| ld1 {v6.4h, v7.4h},[x1],#16 |
| ld1 {v4.4h, v5.4h},[x9],#16 |
| ld1 {v8.4h, v9.4h},[x9],#16 |
| skip_stage2_kernel_load: |
| sub x1,x1,#32 |
| st1 {v30.4h, v31.4h},[x1],#16 |
| st1 {v18.4h, v19.4h},[x1],#16 |
| sub x1,x1,#32 |
| |
| smull v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0) |
| smull v26.4s, v6.4h, v2.h[3] //// y1 * cos3(part of b1) |
| smull v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2) |
| smull v30.4s, v6.4h, v3.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v7.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlsl v28.4s, v7.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| smull v22.4s, v10.4h, v0.h[0] |
| smlsl v22.4s, v11.4h, v3.h[2] |
| smull v20.4s, v10.4h, v0.h[0] |
| smlsl v20.4s, v11.4h, v2.h[2] |
| smull v16.4s, v10.4h, v0.h[0] |
| smlsl v16.4s, v11.4h, v1.h[2] |
| smull v18.4s, v10.4h, v0.h[0] |
| smlsl v18.4s, v11.4h, v0.h[2] |
| |
| |
| |
| cmp x12,x6 |
| bge skip_last8rows_stage2_kernel2 |
| |
| |
| smlsl v24.4s, v8.4h, v3.h[1] |
| smlal v26.4s, v8.4h, v2.h[1] |
| smlal v28.4s, v8.4h, v0.h[1] |
| smlal v30.4s, v8.4h, v2.h[3] |
| |
| |
| smlal v24.4s, v9.4h, v0.h[1] |
| smlal v26.4s, v9.4h, v3.h[1] |
| smlsl v28.4s, v9.4h, v1.h[1] |
| smlsl v30.4s, v9.4h, v2.h[1] |
| |
| |
| |
| smlsl v22.4s, v4.4h, v1.h[0] |
| smlal v22.4s, v5.4h, v2.h[2] |
| smlsl v20.4s, v4.4h, v3.h[0] |
| smlal v20.4s, v5.4h, v0.h[2] |
| smlal v16.4s, v4.4h, v3.h[0] |
| smlal v16.4s, v5.4h, v3.h[2] |
| smlal v18.4s, v4.4h, v1.h[0] |
| smlsl v18.4s, v5.4h, v1.h[2] |
| mov x19,#0xff00 |
| cmp x12,x19 |
| bge skip_last8rows_stage2_kernel2 |
| |
| ld1 {v10.4h, v11.4h},[x11],#16 |
| ld1 {v6.4h, v7.4h},[x11],#16 |
| ld1 {v4.4h, v5.4h},[x0],#16 |
| ld1 {v8.4h, v9.4h},[x0],#16 |
| |
| smlsl v24.4s, v6.4h, v3.h[3] //// y1 * cos1(part of b0) |
| smlsl v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1) |
| smlal v28.4s, v6.4h, v2.h[3] //// y1 * sin3(part of b2) |
| smlal v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3) |
| |
| smlsl v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) |
| smlal v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) |
| smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) |
| smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) |
| |
| |
| smlal v24.4s, v8.4h, v2.h[3] |
| smlal v26.4s, v8.4h, v3.h[3] |
| smlsl v28.4s, v8.4h, v2.h[1] |
| smlal v30.4s, v8.4h, v0.h[3] |
| |
| |
| smlal v24.4s, v9.4h, v1.h[3] |
| smlsl v26.4s, v9.4h, v1.h[1] |
| smlal v28.4s, v9.4h, v0.h[3] |
| smlsl v30.4s, v9.4h, v0.h[1] |
| |
| |
| |
| |
| smlal v22.4s, v10.4h, v0.h[0] |
| smlsl v22.4s, v11.4h, v1.h[2] |
| smlsl v22.4s, v4.4h, v3.h[0] |
| smlal v22.4s, v5.4h, v0.h[2] |
| |
| |
| |
| smlsl v20.4s, v10.4h, v0.h[0] |
| smlsl v20.4s, v11.4h, v3.h[2] |
| smlal v20.4s, v4.4h, v1.h[0] |
| smlsl v20.4s, v5.4h, v1.h[2] |
| |
| |
| smlsl v16.4s, v10.4h, v0.h[0] |
| smlal v16.4s, v11.4h, v0.h[2] |
| smlsl v16.4s, v4.4h, v1.h[0] |
| smlal v16.4s, v5.4h, v2.h[2] |
| |
| |
| |
| smlal v18.4s, v10.4h, v0.h[0] |
| smlsl v18.4s, v11.4h, v2.h[2] |
| smlal v18.4s, v4.4h, v3.h[0] |
| smlsl v18.4s, v5.4h, v3.h[2] |
| |
| |
| skip_last8rows_stage2_kernel2: |
| |
| |
| |
| add v4.4s, v22.4s , v24.4s |
| sub v22.4s, v22.4s , v24.4s |
| |
| add v6.4s, v20.4s , v26.4s |
| sub v24.4s, v20.4s , v26.4s |
| |
| add v10.4s, v16.4s , v28.4s |
| sub v26.4s, v16.4s , v28.4s |
| |
| |
| add v16.4s, v18.4s , v30.4s |
| sub v28.4s, v18.4s , v30.4s |
| |
| |
| sqrshrn v18.4h, v4.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v31.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v30.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v20.4h, v6.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v23.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v21.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) |
| sqrshrn v22.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) |
| |
| ld1 {v4.4h, v5.4h},[x1],#16 |
| ld1 {v8.4h, v9.4h},[x1],#16 |
| |
| |
| |
| // registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11} |
| |
| //d4=x0 |
| //d12=x1 |
| //d5=x2 |
| //d13=x3 |
| |
| //d18=x4 |
| //d20=x5 |
| //d19=x6 |
| //d21=x7 |
| |
| //d22=x8 |
| //d30=x9 |
| //d23=x10 |
| //d31=x11 |
| |
| //d14=x12 |
| //d8=x13 |
| //d15=x14 |
| //d9=x15 |
| |
| umov x15,v26.d[0] |
| umov x16,v27.d[0] |
| umov x19,v28.d[0] |
| umov x20,v29.d[0] |
| |
| trn1 v26.4h, v4.4h, v12.4h |
| trn2 v27.4h, v4.4h, v12.4h |
| trn1 v28.4h, v5.4h, v13.4h |
| trn2 v29.4h, v5.4h, v13.4h |
| |
| trn1 v4.2s, v26.2s, v28.2s |
| trn2 v5.2s, v26.2s, v28.2s |
| trn1 v12.2s, v27.2s, v29.2s |
| trn2 v13.2s, v27.2s, v29.2s |
| |
| trn1 v26.4h, v18.4h, v20.4h |
| trn2 v27.4h, v18.4h, v20.4h |
| trn1 v28.4h, v19.4h, v21.4h |
| trn2 v29.4h, v19.4h, v21.4h |
| |
| trn1 v18.2s, v26.2s, v28.2s |
| trn2 v19.2s, v26.2s, v28.2s |
| trn1 v20.2s, v27.2s, v29.2s |
| trn2 v21.2s, v27.2s, v29.2s |
| |
| trn1 v26.4h, v22.4h, v30.4h |
| trn2 v27.4h, v22.4h, v30.4h |
| trn1 v28.4h, v23.4h, v31.4h |
| trn2 v29.4h, v23.4h, v31.4h |
| |
| trn1 v22.2s, v26.2s, v28.2s |
| trn2 v23.2s, v26.2s, v28.2s |
| trn1 v30.2s, v27.2s, v29.2s |
| trn2 v31.2s, v27.2s, v29.2s |
| |
| trn1 v26.4h, v14.4h, v8.4h |
| trn2 v27.4h, v14.4h, v8.4h |
| trn1 v28.4h, v15.4h, v9.4h |
| trn2 v29.4h, v15.4h, v9.4h |
| |
| trn1 v14.2s, v26.2s, v28.2s |
| trn2 v15.2s, v26.2s, v28.2s |
| trn1 v8.2s, v27.2s, v29.2s |
| trn2 v9.2s, v27.2s, v29.2s |
| |
| mov v26.d[0],x15 |
| mov v27.d[0],x16 |
| mov v28.d[0],x19 |
| mov v29.d[0],x20 |
| |
| // d4 =x0 1- 4 values |
| // d5 =x2 1- 4 values |
| // d12=x1 1- 4 values |
| // d13=x3 1- 4 values |
| |
| // d18 =x0 5- 8 values |
| // d19 =x2 5- 8 values |
| // d20=x1 5- 8 values |
| // d21=x3 5- 8 values |
| |
| // d22 =x0 9- 12 values |
| // d23 =x2 9- 12 values |
| // d30=x1 9- 12 values |
| // d31=x3 9- 12 values |
| |
| // d14 =x0 13-16 values |
| // d15 =x2 13- 16 values |
| // d8=x1 13- 16 values |
| // d9=x3 13- 16 values |
| |
| // swapping v5 and v15 |
| mov v5.d[1],v5.d[0] |
| mov v5.d[0],v18.d[0] |
| mov v18.d[0],v5.d[1] |
| // swapping v23 and v14 |
| mov v23.d[1],v23.d[0] |
| mov v23.d[0],v14.d[0] |
| mov v14.d[0],v23.d[1] |
| // swapping v13 and v20 |
| mov v13.d[1],v13.d[0] |
| mov v13.d[0],v20.d[0] |
| mov v20.d[0],v13.d[1] |
| // swapping v31 and v8 |
| mov v31.d[1],v31.d[0] |
| mov v31.d[0],v8.d[0] |
| mov v8.d[0],v31.d[1] |
| |
| // q2: x0 1-8 values |
| // q11: x0 9-16 values |
| // q9 : x2 1-8 values |
| // q7 : x2 9-16 values |
| // q6 : x1 1- 8 values |
| // q10: x3 1-8 values |
| // q15: x1 9-16 values |
| // q4: x3 9-16 values |
| |
| |
| // registers free: q8,q14,q12,q13 |
| |
| |
| ld1 {v16.8b, v17.8b},[x2],x8 |
| ld1 {v28.8b, v29.8b},[x2],x5 |
| ld1 {v24.8b, v25.8b},[x4],x8 |
| ld1 {v26.8b, v27.8b},[x4],x5 |
| |
| mov v4.d[1] ,v5.d[0] |
| mov v22.d[1] ,v23.d[0] |
| mov v12.d[1] ,v13.d[0] |
| mov v30.d[1] ,v31.d[0] |
| mov v18.d[1] ,v19.d[0] |
| mov v14.d[1] ,v15.d[0] |
| mov v20.d[1] ,v21.d[0] |
| mov v8.d[1] ,v9.d[0] |
| |
| uaddw v4.8h, v4.8h , v16.8b |
| uaddw v22.8h, v22.8h , v17.8b |
| uaddw v12.8h, v12.8h , v28.8b |
| uaddw v30.8h, v30.8h , v29.8b |
| uaddw v18.8h, v18.8h , v24.8b |
| uaddw v14.8h, v14.8h , v25.8b |
| uaddw v20.8h, v20.8h , v26.8b |
| uaddw v8.8h, v8.8h , v27.8b |
| |
| |
| sqxtun v16.8b, v4.8h |
| sqxtun v17.8b, v22.8h |
| sqxtun v28.8b, v12.8h |
| sqxtun v29.8b, v30.8h |
| sqxtun v24.8b, v18.8h |
| sqxtun v25.8b, v14.8h |
| sqxtun v26.8b, v20.8h |
| sqxtun v27.8b, v8.8h |
| |
| |
| |
| st1 {v16.8b, v17.8b},[x3],x7 |
| st1 {v28.8b, v29.8b},[x3],x7 |
| st1 {v24.8b, v25.8b},[x3],x7 |
| st1 {v26.8b, v27.8b},[x3],x7 |
| |
| subs x14,x14,#1 |
| |
| |
| |
| bne second_stage |
| |
| |
| // sub sp,sp,#40 |
| // ldmfd sp!,{x4-x12,pc} |
| ldp x19, x20,[sp],#16 |
| pop_v_regs |
| ret |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |