| ///***************************************************************************** |
| //* |
| //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //*****************************************************************************/ |
| ///** |
| //******************************************************************************* |
| //* ,:file |
| //* ihevc_sao_band_offset_chroma.s |
| //* |
| //* ,:brief |
| //* Contains function definitions for inter prediction interpolation. |
| //* Functions are coded using NEON intrinsics and can be compiled using@ ARM |
| //* RVCT |
| //* |
| //* ,:author |
| //* Parthiban V |
| //* |
| //* ,:par List of Functions: |
| //* |
| //* |
| //* ,:remarks |
| //* None |
| //* |
| //******************************************************************************* |
| //*/ |
| //void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src, |
| // WORD32 src_strd, |
| // UWORD8 *pu1_src_left, |
| // UWORD8 *pu1_src_top, |
| // UWORD8 *pu1_src_top_left, |
| // WORD32 sao_band_pos_u, |
| // WORD32 sao_band_pos_v, |
| // WORD8 *pi1_sao_offset_u, |
| // WORD8 *pi1_sao_offset_v, |
| // WORD32 wd, |
| // WORD32 ht) |
| // |
| //**************Variables Vs Registers***************************************** |
| //x0 => *pu1_src |
| //x1 => src_strd |
| //x2 => *pu1_src_left |
| //x3 => *pu1_src_top |
| //x4 => *pu1_src_top_left 40 |
| //x5 => sao_band_pos_u 44 |
| //x6 => sao_band_pos_v 48 |
| //x7 => *pi1_sao_offset_u 52 |
| //x8 => *pi1_sao_offset_v 56 |
| //x9 => wd 60 |
| //x10=> ht 64 |
| |
| .text |
| .p2align 2 |
| .include "ihevc_neon_macros.s" |
| |
| .globl gu1_table_band_idx |
| .globl ihevc_sao_band_offset_chroma_av8 |
| |
| ihevc_sao_band_offset_chroma_av8: |
| mov x8,#0 |
| mov x9,#0 |
| mov x10,#0 |
| |
| ldr x8,[sp,#0] |
| ldr w9,[sp,#8] |
| ldr w10,[sp,#16] |
| |
| push_v_regs |
| // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments |
| stp x19, x20,[sp,#-16]! |
| stp x21, x22,[sp,#-16]! |
| stp x23, x24,[sp,#-16]! |
| |
| mov x15,x4 // pu1_src_top_left 40 |
| mov x16,x5 // sao_band_pos_u 44 |
| mov x17,x6 // sao_band_pos_v 48 |
| mov x19,x7 // pi1_sao_offset_u 52 |
| mov x20,x8 // pi1_sao_offset_v 56 |
| mov x21,x9 // wd 60 |
| mov x22,x10 // ht 64 |
| |
| MOV x4, x15 //Loads pu1_src_top_left |
| MOV x10, x22 //Loads ht |
| |
| MOV x9, x21 //Loads wd |
| MOV x11,x10 //Move the ht to x9 for loop counter |
| |
| ADD x12,x0,x9 //pu1_src[row * src_strd + (wd)] |
| ADRP x14, :got:gu1_table_band_idx |
| LDR x14, [x14, #:got_lo12:gu1_table_band_idx] |
| |
| SUB x12,x12,#2 //wd-2 |
| |
| SRC_LEFT_LOOP: |
| LDRH w5,[x12] //Load the value |
| ADD x12,x12,x1 |
| SUBS x11,x11,#1 //Decrement the loop counter |
| STRH w5,[x2],#2 //Store the value in pu1_src_left pointer |
| BNE SRC_LEFT_LOOP |
| |
| MOV x5, x16 //Loads sao_band_pos_u |
| LD1 {v1.8b},[x14],#8 //band_table_u.val[0] |
| ADD x12,x3,x9 //pu1_src_top[wd] |
| |
| sub x23,x12,#2 |
| LDRH w11,[x23] |
| LD1 {v2.8b},[x14],#8 //band_table_u.val[1] |
| LSL x6,x5,#3 //sao_band_pos_u |
| |
| STRH w11,[x4] //store to pu1_src_top_left[0] |
| LD1 {v3.8b},[x14],#8 //band_table_u.val[2] |
| MOV x7, x19 //Loads pi1_sao_offset_u |
| |
| SUB x4,x10,#1 //ht-1 |
| dup v31.8b,w6 //band_pos_u |
| mul x4, x4, x1 //ht-1 * src_strd |
| |
| ADD x4,x4,x0 //pu1_src[(ht - 1) * src_strd] |
| LD1 {v4.8b},[x14],#8 //band_table_u.val[3] |
| MOV x11,x9 //Move the wd to x9 for loop counter |
| |
| SRC_TOP_LOOP: //wd is always multiple of 8 |
| LD1 {v0.8b},[x4],#8 //Load pu1_src[(ht - 1) * src_strd + col] |
| SUBS x11,x11,#8 //Decrement the loop counter by 8 |
| ST1 {v0.8b},[x3],#8 //Store to pu1_src_top[col] |
| BNE SRC_TOP_LOOP |
| |
| LD1 {v30.8b},[x7] //pi1_sao_offset_u load |
| ADD v5.8b, v1.8b , v31.8b //band_table_u.val[0] = vadd_u8(band_table_u.val[0], sao_band_pos_u) |
| |
| dup v29.8b, v30.8b[1] //vdup_n_u8(pi1_sao_offset_u[1]) |
| ADD v6.8b, v2.8b , v31.8b //band_table_u.val[1] = vadd_u8(band_table_u.val[1], sao_band_pos_u) |
| |
| dup v28.8b, v30.8b[2] //vdup_n_u8(pi1_sao_offset_u[2]) |
| ADD v7.8b, v3.8b , v31.8b //band_table_u.val[2] = vadd_u8(band_table_u.val[2], sao_band_pos_u) |
| |
| dup v27.8b, v30.8b[3] //vdup_n_u8(pi1_sao_offset_u[3]) |
| ADD v8.8b, v4.8b , v31.8b //band_table_u.val[3] = vadd_u8(band_table_u.val[3], sao_band_pos_u) |
| |
| CMP x5,#28 |
| dup v26.8b, v30.8b[4] //vdup_n_u8(pi1_sao_offset_u[4]) |
| ADRP x14, :got:gu1_table_band_idx |
| LDR x14, [x14, #:got_lo12:gu1_table_band_idx] |
| |
| movi v30.8b, #16 //vdup_n_u8(16) |
| ADD v1.8b, v5.8b , v29.8b //band_table_u.val[0] = vadd_u8(band_table_u.val[0], vdup_n_u8(pi1_sao_offset_u[1])) |
| |
| LD1 {v9.8b},[x14],#8 //band_table_v.val[0] |
| ADD v2.8b, v6.8b , v28.8b //band_table_u.val[1] = vadd_u8(band_table_u.val[1], vdup_n_u8(pi1_sao_offset_u[2])) |
| |
| LD1 {v10.8b},[x14],#8 //band_table_v.val[1] |
| ADD v3.8b, v7.8b , v27.8b //band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3])) |
| |
| MOV x6, x17 //Loads sao_band_pos_v |
| ADD v4.8b, v8.8b , v26.8b //band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4])) |
| LSL x11,x6,#3 //sao_band_pos_v |
| |
| BLT SAO_BAND_POS_U_0 |
| |
| SAO_BAND_POS_U_28: //case 28 |
| cmhs v13.8b, v30.8b , v4.8b //vcle_u8(band_table.val[3], vdup_n_u8(16)) |
| BNE SAO_BAND_POS_U_29 |
| |
| ORR v4.8b, v4.8b , v13.8b //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp) |
| B SWITCH_BREAK_U |
| |
| SAO_BAND_POS_U_29: //case 29 |
| CMP x5,#29 |
| |
| cmhs v14.8b, v30.8b , v3.8b //vcle_u8(band_table.val[2], vdup_n_u8(16)) |
| BNE SAO_BAND_POS_U_30 |
| ORR v3.8b, v3.8b , v14.8b //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp) |
| |
| AND v4.8b, v4.8b , v13.8b //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp) |
| B SWITCH_BREAK_U |
| |
| SAO_BAND_POS_U_30: //case 30 |
| CMP x5,#30 |
| |
| cmhs v15.8b, v30.8b , v2.8b //vcle_u8(band_table.val[1], vdup_n_u8(16)) |
| BNE SAO_BAND_POS_U_31 |
| ORR v2.8b, v2.8b , v15.8b //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp) |
| |
| AND v3.8b, v3.8b , v14.8b //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp) |
| |
| SAO_BAND_POS_U_31: //case 31 |
| CMP x5,#31 |
| BNE SWITCH_BREAK_U |
| |
| cmhs v16.8b, v30.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16)) |
| ORR v1.8b, v1.8b , v16.8b //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp) |
| |
| AND v2.8b, v2.8b , v15.8b //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp) |
| B SWITCH_BREAK_U |
| |
| SAO_BAND_POS_U_0: |
| CMP x5,#0 //case 0 |
| BNE SWITCH_BREAK_U |
| |
| cmhs v16.8b, v30.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16)) |
| AND v1.8b, v1.8b , v16.8b //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp) |
| |
| SWITCH_BREAK_U: |
| dup v30.8b,w11 //band_pos_v |
| MOV x8, x20 //Loads pi1_sao_offset_v |
| |
| LD1 {v11.8b},[x14],#8 //band_table_v.val[2] |
| ADD v13.8b, v9.8b , v30.8b //band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v) |
| |
| LD1 {v12.8b},[x14],#8 //band_table_v.val[3] |
| ADD v14.8b, v10.8b , v30.8b //band_table_v.val[1] = vadd_u8(band_table_v.val[1], band_pos_v) |
| |
| LD1 {v25.8b},[x8] //pi1_sao_offset_v load |
| ADD v15.8b, v11.8b , v30.8b //band_table_v.val[2] = vadd_u8(band_table_v.val[2], band_pos_v) |
| |
| dup v29.8b, v25.8b[1] //vdup_n_u8(pi1_sao_offset_v[1]) |
| ADD v16.8b, v12.8b , v30.8b //band_table_v.val[3] = vadd_u8(band_table_v.val[3], band_pos_v) |
| |
| dup v28.8b, v25.8b[2] //vdup_n_u8(pi1_sao_offset_v[2]) |
| ADD v9.8b, v13.8b , v29.8b //band_table_v.val[0] = vadd_u8(band_table_v.val[0], vdup_n_u8(pi1_sao_offset_v[1])) |
| |
| dup v27.8b, v25.8b[3] //vdup_n_u8(pi1_sao_offset_v[3]) |
| ADD v10.8b, v14.8b , v28.8b //band_table_v.val[1] = vadd_u8(band_table_v.val[1], vdup_n_u8(pi1_sao_offset_v[2])) |
| |
| dup v26.8b, v25.8b[4] //vdup_n_u8(pi1_sao_offset_v[4]) |
| ADD v11.8b, v15.8b , v27.8b //band_table_v.val[2] = vadd_u8(band_table_v.val[2], vdup_n_u8(pi1_sao_offset_v[3])) |
| |
| movi v29.8b, #16 //vdup_n_u8(16) |
| ADD v12.8b, v16.8b , v26.8b //band_table_v.val[3] = vadd_u8(band_table_v.val[3], vdup_n_u8(pi1_sao_offset_v[4])) |
| AND x12,x9,#0xf |
| |
| CMP x6,#28 |
| BLT SAO_BAND_POS_V_0 |
| |
| SAO_BAND_POS_V_28: //case 28 |
| cmhs v17.8b, v29.8b , v12.8b //vcle_u8(band_table.val[3], vdup_n_u8(16)) |
| BNE SAO_BAND_POS_V_29 |
| ORR v12.8b, v12.8b , v17.8b //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp) |
| B SWITCH_BREAK_V |
| |
| SAO_BAND_POS_V_29: //case 29 |
| CMP x6,#29 |
| |
| cmhs v18.8b, v29.8b , v11.8b //vcle_u8(band_table.val[2], vdup_n_u8(16)) |
| BNE SAO_BAND_POS_V_30 |
| ORR v11.8b, v11.8b , v18.8b //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp) |
| |
| AND v12.8b, v12.8b , v17.8b //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp) |
| B SWITCH_BREAK_V |
| |
| SAO_BAND_POS_V_30: //case 30 |
| CMP x6,#30 |
| |
| cmhs v19.8b, v29.8b , v10.8b //vcle_u8(band_table.val[1], vdup_n_u8(16)) |
| BNE SAO_BAND_POS_V_31 |
| ORR v10.8b, v10.8b , v19.8b //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp) |
| |
| AND v11.8b, v11.8b , v18.8b //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp) |
| B SWITCH_BREAK_V |
| |
| SAO_BAND_POS_V_31: //case 31 |
| CMP x6,#31 |
| BNE SWITCH_BREAK_V |
| |
| cmhs v20.8b, v29.8b , v9.8b //vcle_u8(band_table.val[0], vdup_n_u8(16)) |
| ORR v9.8b, v9.8b , v20.8b //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp) |
| |
| AND v10.8b, v10.8b , v19.8b //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp) |
| B SWITCH_BREAK_V |
| |
| SAO_BAND_POS_V_0: |
| CMP x6,#0 //case 0 |
| BNE SWITCH_BREAK_V |
| |
| cmhs v20.8b, v29.8b , v9.8b //vcle_u8(band_table.val[0], vdup_n_u8(16)) |
| AND v9.8b, v9.8b , v20.8b //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp) |
| |
| SWITCH_BREAK_V: |
| CMP x9,#16 |
| MOV x4,x0 //pu1_src_cpy |
| mov v1.d[1],v2.d[0] |
| mov v2.d[0],v3.d[0] |
| mov v2.d[1],v4.d[0] |
| mov v9.d[1],v10.d[0] |
| mov v10.d[0],v11.d[0] |
| mov v10.d[1],v12.d[0] |
| BLT WIDTH_RESIDUE |
| |
| WIDTH_LOOP: //Width is assigned to be multiple of 16 |
| MOV x4,x0 //pu1_src_cpy |
| MOV x11,x10 //move ht |
| ADD x5,x4,x1 |
| |
| HEIGHT_LOOP: //unrolled for 4 rows |
| |
| ADD x6,x5,x1 |
| LD2 {v5.8b, v6.8b},[x4] //vld1q_u8(pu1_src_cpy) |
| ADD x7,x6,x1 |
| |
| LD2 {v13.8b, v14.8b},[x5] //vld1q_u8(pu1_src_cpy) |
| SUB v7.8b, v5.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| |
| LD2 {v17.8b, v18.8b},[x6] //vld1q_u8(pu1_src_cpy) |
| SUB v8.8b, v6.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| |
| LD2 {v21.8b, v22.8b},[x7] //vld1q_u8(pu1_src_cpy) |
| SUB v15.8b, v13.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| |
| TBX v5.8b, {v1.16b- v2.16b},v7.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| SUB v16.8b, v14.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| |
| TBX v6.8b, {v9.16b- v10.16b},v8.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| SUB v19.8b, v17.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| |
| TBX v13.8b, {v1.16b- v2.16b},v15.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| SUB v20.8b, v18.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| |
| TBX v14.8b, {v9.16b- v10.16b},v16.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| SUB v23.8b, v21.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| |
| ST2 {v5.8b, v6.8b},[x4] //vst1q_u8(pu1_src_cpy, au1_cur_row) |
| SUB v24.8b, v22.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| |
| SUBS x11,x11,#4 //Decrement the ht loop count by 4 |
| TBX v17.8b, {v1.16b- v2.16b},v19.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| |
| ST2 {v13.8b, v14.8b},[x5] //vst1q_u8(pu1_src_cpy, au1_cur_row) |
| |
| TBX v18.8b, {v9.16b- v10.16b},v20.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| TBX v21.8b, {v1.16b- v2.16b},v23.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| TBX v22.8b, {v9.16b- v10.16b},v24.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| |
| ST2 {v17.8b, v18.8b},[x6],x1 //vst1q_u8(pu1_src_cpy, au1_cur_row) |
| |
| ADD x4,x6,x1 |
| ST2 {v21.8b, v22.8b},[x7] //vst1q_u8(pu1_src_cpy, au1_cur_row) |
| ADD x5,x4,x1 |
| |
| BNE HEIGHT_LOOP |
| |
| SUB x9,x9,#16 //Decrement the width loop by 16 |
| ADD x0,x0,#16 |
| CMP x9,#8 |
| BGT WIDTH_LOOP |
| BLT END_LOOP |
| MOV x4,x0 //pu1_src_cpy |
| |
| WIDTH_RESIDUE: //If width is not multiple of 16 |
| |
| ADD x5,x4,x1 |
| LD2 {v5.8b, v6.8b},[x4] //vld1q_u8(pu1_src_cpy) |
| ADD x6,x5,x1 |
| |
| ADD x7,x6,x1 |
| LD2 {v13.8b, v14.8b},[x5] //vld1q_u8(pu1_src_cpy) |
| SUB v7.8b, v5.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| |
| LD2 {v17.8b, v18.8b},[x6] //vld1q_u8(pu1_src_cpy) |
| SUB v8.8b, v6.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| |
| TBX v5.8b, {v1.16b- v2.16b},v7.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| SUB v15.8b, v13.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| |
| TBX v6.8b, {v9.16b- v10.16b},v8.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| SUB v16.8b, v14.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| |
| LD2 {v21.8b, v22.8b},[x7] //vld1q_u8(pu1_src_cpy) |
| SUB v19.8b, v17.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| |
| TBX v13.8b, {v1.16b- v2.16b},v15.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| SUB v20.8b, v18.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| |
| TBX v14.8b, {v9.16b- v10.16b},v16.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| ZIP1 v28.8b, v5.8b, v6.8b |
| ZIP2 v6.8b, v5.8b, v6.8b |
| mov v5.8b, v28.8b |
| |
| TBX v17.8b, {v1.16b- v2.16b},v19.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| SUB v23.8b, v21.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u) |
| |
| ST1 {v5.8b},[x4] //vst1q_u8(pu1_src_cpy, au1_cur_row) |
| ZIP1 v28.8b, v13.8b, v14.8b |
| ZIP2 v14.8b, v13.8b, v14.8b |
| mov v13.8b, v28.8b |
| |
| TBX v18.8b, {v9.16b- v10.16b},v20.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| SUB v24.8b, v22.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v) |
| |
| ST1 {v13.8b},[x5] //vst1q_u8(pu1_src_cpy, au1_cur_row) |
| SUBS x10,x10,#4 //Decrement the ht loop count by 4 |
| |
| TBX v21.8b, {v1.16b- v2.16b},v23.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u)) |
| ZIP1 v28.8b, v17.8b, v18.8b |
| ZIP2 v18.8b, v17.8b, v18.8b |
| mov v17.8b, v28.8b |
| |
| TBX v22.8b, {v9.16b- v10.16b},v24.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v)) |
| ST1 {v17.8b},[x6],x1 //vst1q_u8(pu1_src_cpy, au1_cur_row) |
| ZIP1 v28.8b, v21.8b, v22.8b |
| ZIP2 v22.8b, v21.8b, v22.8b |
| mov v21.8b, v28.8b |
| |
| ADD x4,x6,x1 |
| ST1 {v21.8b},[x7] //vst1q_u8(pu1_src_cpy, au1_cur_row) |
| ADD x5,x4,x1 |
| |
| BNE WIDTH_RESIDUE |
| |
| END_LOOP: |
| // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP |
| ldp x23, x24,[sp],#16 |
| ldp x21, x22,[sp],#16 |
| ldp x19, x20,[sp],#16 |
| pop_v_regs |
| ret |
| |
| |
| |