| //****************************************************************************** |
| //* |
| //* Copyright (C) 2015 The Android Open Source Project |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //***************************************************************************** |
| //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| //*/ |
| ///** |
| // ******************************************************************************* |
| // * @file |
| // * ih264e_half_pel.s |
| // * |
| // * @brief |
| // * Half-pel luma interpolation (six-tap filter) routines in AArch64 NEON assembly |
| // * |
| // * @author |
| // * Ittiam |
| // * |
| // * @par List of Functions: |
| // * ih264e_sixtapfilter_horz |
| // * ih264e_sixtap_filter_2dvh_vert |
| // |
| // * |
| // * @remarks |
| // * None |
| // * |
| // ******************************************************************************* |
| // */ |
| |
| |
| .text |
| .p2align 2 |
| .include "ih264_neon_macros.s" |
| |
| ///******************************************************************************* |
| //* |
| //* @brief |
| //* Inter-prediction luma filter in the horizontal direction (filter is run for width = 17 and height = 16) |
| //* |
| //* @par Description: |
| //* Applies a 6-tap horizontal filter. The output is clipped to 8 bits. |
| //* See sec 8.4.2.2.1 titled "Luma sample interpolation process". |
| //* |
| //* @param[in] pu1_src |
| //* UWORD8 pointer to the source |
| //* |
| //* @param[out] pu1_dst |
| //* UWORD8 pointer to the destination |
| //* |
| //* @param[in] src_strd |
| //* integer source stride |
| //* |
| //* @param[in] dst_strd |
| //* integer destination stride |
| //* |
| //* |
| //* @returns |
| //* |
| //* @remarks |
| //* None |
| //* |
| //******************************************************************************* |
| //*/ |
| //void ih264e_sixtapfilter_horz(UWORD8 *pu1_src, |
| // UWORD8 *pu1_dst, |
| // WORD32 src_strd, |
| // WORD32 dst_strd); |
| |
| |
| .equ halfpel_width , 17 + 1 //( make it even, two rows are processed at a time) |
| |
| |
| //------------------------------------------------------------------------------ |
| // ih264e_sixtapfilter_horz_av8 |
| // |
| // Horizontal six-tap half-pel filter over 16 rows, two rows per loop pass. |
| // With a0..a5 the six consecutive source bytes that start two bytes to the |
| // left of an output position, every output byte is |
| // |
| // out = sat_u8((a0 - 5*a1 + 20*a2 + 20*a3 - 5*a4 + a5 + 16) >> 5) |
| // |
| // In: |
| // x0 = pu1_src source pointer; 24 bytes are read per row from x0 - 2 |
| // x1 = pu1_dst destination pointer; 18 bytes are written per row |
| // w2 = src_strd source stride (sign-extended to 64 bits here) |
| // w3 = dst_strd destination stride (sign-extended to 64 bits here) |
| // |
| // Register roles: |
| // v0 / v1 constant 5 / constant 20 in every byte lane |
| // v2-v4 / v5-v7 raw bytes of row0 / row1 (three 8-byte columns each) |
| // v8,v10,v12 / v14,v16,v18 16-bit accumulators of row0 / row1 |
| // v26-v31 byte-shifted views of the rows produced by ext |
| // v20-v22 / v23-v25 narrowed 8-bit results of row0 / row1 |
| // x14 rows still to be produced |
| // |
| // Notes: |
| // * The 16-bit sums stay within [-2550, 10710], so reading them as signed |
| // values in sqrshrun is exact. |
| // * Column 3 uses ext of a register with itself, which wraps around, but |
| // only lanes 0-1 of its result are stored and those never see wrapped |
| // bytes (shift <= 5, so source index <= 6). |
| // * ext and multiply-accumulate instructions are interleaved by hand: each |
| // ext is issued ahead of the accumulate that consumes it. Keep the order. |
| // * x19/x20 are never touched here, so no general-purpose callee-saved |
| // register is spilled. |
| // |
| // Clobbers: x0-x3, x14, v0-v7, v16, v18, v20-v31, flags. v8, v10, v12 and v14 |
| // are written as well; push_v_regs / pop_v_regs come from ih264_neon_macros.s |
| // (not visible here) and presumably save and restore d8-d15 - confirm there. |
| //------------------------------------------------------------------------------ |
| .global ih264e_sixtapfilter_horz_av8 |
| ih264e_sixtapfilter_horz_av8: |
| push_v_regs |
| sxtw x2, w2 // src_strd becomes a 64-bit post-index step |
| sxtw x3, w3 // dst_strd likewise |
| |
| movi v0.8b, #5 // weight of taps a1 and a4 |
| sub x0, x0, #2 // start two bytes left so lane i holds a0 of output i |
| sub x3, x3, #16 // each row is stored as 16 + 2 bytes; this is the step after the 2-byte part |
| movi v1.8b, #20 // weight of taps a2 and a3 |
| mov x14, #16 // number of rows to produce |
| |
| filter_horz_loop: |
| |
| ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 // row0: 24 source bytes, then step one row |
| ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 // row1: 24 source bytes, then step one row |
| |
| // ---- acc = a0 + a5 (a5 is the row shifted by 5 bytes) ---- |
| ext v31.8b, v2.8b , v3.8b , #5 |
| ext v30.8b, v3.8b , v4.8b , #5 |
| |
| uaddl v8.8h, v31.8b, v2.8b // row0, column 1 |
| ext v29.8b, v4.8b , v4.8b , #5 |
| uaddl v10.8h, v30.8b, v3.8b // row0, column 2 |
| ext v28.8b, v5.8b , v6.8b , #5 |
| uaddl v12.8h, v29.8b, v4.8b // row0, column 3 |
| ext v27.8b, v6.8b , v7.8b , #5 |
| uaddl v14.8h, v28.8b, v5.8b // row1, column 1 |
| ext v26.8b, v7.8b , v7.8b , #5 |
| |
| uaddl v16.8h, v27.8b, v6.8b // row1, column 2 |
| ext v31.8b, v2.8b , v3.8b , #2 |
| uaddl v18.8h, v26.8b, v7.8b // row1, column 3 |
| // ---- acc += 20 * a2 (row shifted by 2 bytes) ---- |
| ext v30.8b, v3.8b , v4.8b , #2 |
| umlal v8.8h, v31.8b, v1.8b // row0, column 1 |
| ext v29.8b, v4.8b , v4.8b , #2 |
| umlal v10.8h, v30.8b, v1.8b // row0, column 2 |
| ext v28.8b, v5.8b , v6.8b , #2 |
| umlal v12.8h, v29.8b, v1.8b // row0, column 3 |
| ext v27.8b, v6.8b , v7.8b , #2 |
| umlal v14.8h, v28.8b, v1.8b // row1, column 1 |
| ext v26.8b, v7.8b , v7.8b , #2 |
| |
| umlal v16.8h, v27.8b, v1.8b // row1, column 2 |
| ext v31.8b, v2.8b , v3.8b , #3 |
| umlal v18.8h, v26.8b, v1.8b // row1, column 3 |
| // ---- acc += 20 * a3 (row shifted by 3 bytes) ---- |
| ext v30.8b, v3.8b , v4.8b , #3 |
| umlal v8.8h, v31.8b, v1.8b // row0, column 1 |
| ext v29.8b, v4.8b , v4.8b , #3 |
| umlal v10.8h, v30.8b, v1.8b // row0, column 2 |
| ext v28.8b, v5.8b , v6.8b , #3 |
| umlal v12.8h, v29.8b, v1.8b // row0, column 3 |
| ext v27.8b, v6.8b , v7.8b , #3 |
| umlal v14.8h, v28.8b, v1.8b // row1, column 1 |
| ext v26.8b, v7.8b , v7.8b , #3 |
| |
| umlal v16.8h, v27.8b, v1.8b // row1, column 2 |
| ext v31.8b, v2.8b , v3.8b , #1 |
| umlal v18.8h, v26.8b, v1.8b // row1, column 3 |
| // ---- acc -= 5 * a1 (row shifted by 1 byte) ---- |
| ext v30.8b, v3.8b , v4.8b , #1 |
| umlsl v8.8h, v31.8b, v0.8b // row0, column 1 |
| ext v29.8b, v4.8b , v4.8b , #1 |
| umlsl v10.8h, v30.8b, v0.8b // row0, column 2 |
| ext v28.8b, v5.8b , v6.8b , #1 |
| umlsl v12.8h, v29.8b, v0.8b // row0, column 3 |
| ext v27.8b, v6.8b , v7.8b , #1 |
| umlsl v14.8h, v28.8b, v0.8b // row1, column 1 |
| ext v26.8b, v7.8b , v7.8b , #1 |
| |
| umlsl v16.8h, v27.8b, v0.8b // row1, column 2 |
| ext v31.8b, v2.8b , v3.8b , #4 |
| umlsl v18.8h, v26.8b, v0.8b // row1, column 3 |
| // ---- acc -= 5 * a4 (row shifted by 4 bytes) ---- |
| ext v30.8b, v3.8b , v4.8b , #4 |
| umlsl v8.8h, v31.8b, v0.8b // row0, column 1 |
| ext v29.8b, v4.8b , v4.8b , #4 |
| umlsl v10.8h, v30.8b, v0.8b // row0, column 2 |
| ext v28.8b, v5.8b , v6.8b , #4 |
| umlsl v12.8h, v29.8b, v0.8b // row0, column 3 |
| ext v27.8b, v6.8b , v7.8b , #4 |
| umlsl v14.8h, v28.8b, v0.8b // row1, column 1 |
| ext v26.8b, v7.8b , v7.8b , #4 |
| |
| umlsl v16.8h, v27.8b, v0.8b // row1, column 2 |
| umlsl v18.8h, v26.8b, v0.8b // row1, column 3 |
| |
| // ---- out = sat_u8((acc + 16) >> 5): rounding, saturating, narrowing shift ---- |
| sqrshrun v20.8b, v8.8h, #5 // row0, column 1 |
| sqrshrun v21.8b, v10.8h, #5 // row0, column 2 |
| sqrshrun v22.8b, v12.8h, #5 // row0, column 3 (only lanes 0-1 are stored) |
| sqrshrun v23.8b, v14.8h, #5 // row1, column 1 |
| sqrshrun v24.8b, v16.8h, #5 // row1, column 2 |
| sqrshrun v25.8b, v18.8h, #5 // row1, column 3 (only lanes 0-1 are stored) |
| |
| st1 {v20.8b, v21.8b}, [x1], #16 // row0: output bytes 0-15 |
| st1 {v22.h}[0], [x1], x3 // row0: output bytes 16-17, then advance to the next dst row |
| st1 {v23.8b, v24.8b}, [x1], #16 // row1: output bytes 0-15 |
| st1 {v25.h}[0], [x1], x3 // row1: output bytes 16-17, then advance to the next dst row |
| |
| subs x14, x14, #2 // two rows finished |
| |
| bne filter_horz_loop // repeat until all 16 rows are done |
| |
| |
| pop_v_regs |
| ret |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| ///** |
| //******************************************************************************* |
| //* |
| //* @brief |
| //* This function implements a two stage cascaded six tap filter. It |
| //* applies the six tap filter in the vertical direction on the |
| //* predictor values, followed by applying the same filter in the |
| //* horizontal direction on the output of the first stage. The six tap |
| //* filtering operation is described in sec 8.4.2.2.1 titled "Luma sample |
| //* interpolation process" |
| //* (Filter run for width = 17 and height =17) |
| //* @par Description: |
| //* The function interpolates |
| //* the predictors first in the vertical direction and then in the |
| //* horizontal direction to output the (1/2,1/2). The output of the first |
| //* stage of the filter is stored in the buffer pointed to by pi16_pred1(only in C) |
| //* in 16 bit precision. |
| //* |
| //* |
| //* @param[in] pu1_src |
| //* UWORD8 pointer to the source |
| //* |
| //* @param[out] pu1_dst1 |
| //* UWORD8 pointer to the destination(vertical filtered output) |
| //* |
| //* @param[out] pu1_dst2 |
| //* UWORD8 pointer to the destination (output after applying the horizontal filter to the intermediate vertical output) |
| //* |
| //* @param[in] src_strd |
| //* integer source stride |
| //* |
| //* @param[in] dst_strd |
| //* integer destination stride (applied to both pu1_dst1 and pu1_dst2) |
| //* |
| //* @param[in] pi16_pred1 |
| //* Pointer to the 16-bit intermediate buffer (used only in the C implementation) |
| //* |
| //* @param[in] pi16_pred1_strd |
| //* integer destination stride of pi16_pred1 |
| //* |
| //* |
| //* @returns |
| //* |
| //* @remarks |
| //* None |
| //* |
| //******************************************************************************* |
| //*/ |
| //void ih264e_sixtap_filter_2dvh_vert(UWORD8 *pu1_src, |
| // UWORD8 *pu1_dst1, |
| // UWORD8 *pu1_dst2, |
| // WORD32 src_strd, |
| // WORD32 dst_strd, |
| // WORD32 *pi16_pred1,/* Pointer to 16-bit intermediate buffer (used only in C) */ |
| // WORD32 pi16_pred1_strd) |
| |
| |
| |
| |
| .global ih264e_sixtap_filter_2dvh_vert_av8 |
| |
| ih264e_sixtap_filter_2dvh_vert_av8: |
| // STMFD sp!,{x10,x11,x12,x14} |
| push_v_regs |
| sxtw x3, w3 |
| sxtw x4, w4 |
| stp x19, x20, [sp, #-16]! |
| |
| ////x0 - pu1_ref |
| ////x3 - u4_ref_width |
| |
| //// Load six rows for vertical interpolation |
| lsl x12, x3, #1 |
| sub x0, x0, x12 |
| sub x0, x0, #2 |
| ld1 {v2.8b, v3.8b, v4.8b}, [x0], x3 |
| ld1 {v5.8b, v6.8b, v7.8b}, [x0], x3 |
| ld1 {v8.8b, v9.8b, v10.8b}, [x0], x3 |
| mov x12, #5 |
| ld1 {v11.8b, v12.8b, v13.8b}, [x0], x3 |
| mov x14, #20 |
| ld1 {v14.8b, v15.8b, v16.8b}, [x0], x3 |
| mov v0.h[0], w12 |
| mov v0.h[1], w14 |
| ld1 {v17.8b, v18.8b, v19.8b}, [x0], x3 |
| movi v1.8b, #20 |
| |
| //// x12 - u2_buff1_width |
| //// x14 - u2_buff2_width |
| mov x12, x4 |
| add x11, x1, #16 |
| |
| mov x14, x12 |
| |
| mov x10, #3 //loop counter |
| sub x16 , x12, #8 |
| sub x19, x14, #16 |
| filter_2dvh_loop: |
| |
| //// ////////////// ROW 1 /////////////////////// |
| |
| //// Process first vertical interpolated row |
| //// each column is |
| uaddl v20.8h, v2.8b, v17.8b //// a0 + a5 (column1,row0) |
| movi v31.8b, #5 |
| umlal v20.8h, v8.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) |
| umlal v20.8h, v11.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) |
| umlsl v20.8h, v5.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) |
| umlsl v20.8h, v14.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) |
| mov v21.d[0], v20.d[1] |
| |
| uaddl v22.8h, v3.8b, v18.8b //// a0 + a5 (column2,row0) |
| umlal v22.8h, v9.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) |
| umlal v22.8h, v12.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) |
| umlsl v22.8h, v6.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) |
| umlsl v22.8h, v15.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) |
| ext v30.8b, v20.8b , v21.8b , #4 |
| mov v23.d[0], v22.d[1] |
| |
| |
| uaddl v24.8h, v4.8b, v19.8b //// a0 + a5 (column3,row0) |
| ext v29.8b, v20.8b , v21.8b , #6 |
| umlal v24.8h, v10.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) |
| umlal v24.8h, v13.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) |
| umlsl v24.8h, v7.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) |
| umlsl v24.8h, v16.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) |
| mov v25.d[0], v24.d[1] |
| |
| sqrshrun v2.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) |
| ext v31.8b, v21.8b , v22.8b , #2 |
| sqrshrun v3.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) |
| ext v28.8b, v20.8b , v21.8b , #2 |
| |
| saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) |
| ext v31.8b, v22.8b , v23.8b , #2 |
| smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1) |
| smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1) |
| smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) |
| smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) |
| ext v30.8b, v21.8b , v22.8b , #4 |
| |
| sqrshrun v4.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) |
| ext v29.8b, v21.8b , v22.8b , #6 |
| |
| ext v28.8b, v21.8b , v22.8b , #2 |
| saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) |
| smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2) |
| smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2) |
| smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) |
| smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) |
| ext v31.8b, v23.8b , v24.8b , #2 |
| mov v21.d[0], v20.d[1] |
| ext v2.8b, v2.8b , v3.8b , #2 |
| ext v3.8b, v3.8b , v4.8b , #2 |
| ext v4.8b, v4.8b , v4.8b , #2 |
| |
| st1 {v2.8b, v3.8b}, [x1], x12 //// store row1 - 1,1/2 grid |
| st1 {v4.h}[0], [x11], x12 //// store row1 - 1,1/2 grid |
| |
| ext v30.8b, v22.8b , v23.8b , #4 |
| ext v29.8b, v22.8b , v23.8b , #6 |
| |
| saddl v2.4s, v31.4h, v22.4h //// a0 + a5 (set3) |
| ext v28.8b, v22.8b , v23.8b , #2 |
| smlal v2.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3) |
| smlal v2.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3) |
| smlsl v2.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) |
| smlsl v2.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) |
| ext v31.8b, v24.8b , v25.8b , #2 |
| |
| shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2) |
| ext v30.8b, v23.8b , v24.8b , #4 |
| shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) |
| ext v29.8b, v23.8b , v24.8b , #6 |
| |
| saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4) |
| ext v28.8b, v23.8b , v24.8b , #2 |
| ext v31.8b, v25.8b , v25.8b , #2 |
| smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4) |
| smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4) |
| smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) |
| smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) |
| ext v30.8b, v24.8b , v25.8b , #4 |
| |
| saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5) |
| ext v29.8b, v24.8b , v25.8b , #6 |
| |
| ext v31.8b, v24.8b , v25.8b , #2 |
| shrn v28.4h, v2.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) |
| |
| ld1 {v2.8b, v3.8b, v4.8b}, [x0], x3 //// Load next Row data |
| smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5) |
| smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5) |
| smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) |
| smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) |
| shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4) |
| mov v20.d[1], v21.d[0] |
| sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 |
| |
| |
| ////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 |
| ////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) |
| |
| ////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 |
| |
| ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values |
| //// ////////////// ROW 2 /////////////////////// |
| |
| //// Process first vertical interpolated row |
| //// each column is |
| uaddl v20.8h, v5.8b, v2.8b //// a0 + a5 (column1,row0) |
| movi v31.8b, #5 |
| umlal v20.8h, v11.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) |
| umlal v20.8h, v14.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) |
| umlsl v20.8h, v8.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) |
| umlsl v20.8h, v17.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) |
| mov v21.d[0], v20.d[1] |
| |
| mov v28.d[1], v29.d[0] |
| sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 |
| |
| shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) |
| |
| uaddl v22.8h, v6.8b, v3.8b //// a0 + a5 (column2,row0) |
| umlal v22.8h, v12.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) |
| umlal v22.8h, v15.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) |
| umlsl v22.8h, v9.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) |
| umlsl v22.8h, v18.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) |
| mov v23.d[0], v22.d[1] |
| |
| sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 |
| ext v30.8b, v20.8b , v21.8b , #4 |
| |
| uaddl v24.8h, v7.8b, v4.8b //// a0 + a5 (column3,row0) |
| ext v29.8b, v20.8b , v21.8b , #6 |
| umlal v24.8h, v13.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) |
| umlal v24.8h, v16.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) |
| umlsl v24.8h, v10.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) |
| umlsl v24.8h, v19.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) |
| mov v25.d[0], v24.d[1] |
| |
| st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values |
| st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values |
| |
| sqrshrun v5.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) |
| ext v31.8b, v21.8b , v22.8b , #2 |
| sqrshrun v6.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) |
| ext v28.8b, v20.8b , v21.8b , #2 |
| |
| saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) |
| ext v31.8b, v22.8b , v23.8b , #2 |
| smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1) |
| smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1) |
| smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) |
| smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) |
| ext v30.8b, v21.8b , v22.8b , #4 |
| |
| sqrshrun v7.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) |
| ext v29.8b, v21.8b , v22.8b , #6 |
| |
| ext v28.8b, v21.8b , v22.8b , #2 |
| saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) |
| smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2) |
| smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2) |
| smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) |
| smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) |
| ext v31.8b, v23.8b , v24.8b , #2 |
| |
| ext v5.8b, v5.8b , v6.8b , #2 |
| ext v6.8b, v6.8b , v7.8b , #2 |
| ext v7.8b, v7.8b , v7.8b , #2 |
| |
| st1 {v5.8b, v6.8b}, [x1], x12 //// store row1 - 1,1/2 grid |
| st1 {v7.h}[0], [x11], x12 //// store row1 - 1,1/2 grid |
| |
| ext v30.8b, v22.8b , v23.8b , #4 |
| ext v29.8b, v22.8b , v23.8b , #6 |
| |
| saddl v6.4s, v31.4h, v22.4h //// a0 + a5 (set3) |
| ext v28.8b, v22.8b , v23.8b , #2 |
| smlal v6.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3) |
| smlal v6.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3) |
| smlsl v6.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) |
| smlsl v6.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) |
| ext v31.8b, v24.8b , v25.8b , #2 |
| |
| shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2) |
| ext v30.8b, v23.8b , v24.8b , #4 |
| shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) |
| ext v29.8b, v23.8b , v24.8b , #6 |
| |
| saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4) |
| ext v28.8b, v23.8b , v24.8b , #2 |
| ext v31.8b, v25.8b , v25.8b , #2 |
| smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4) |
| smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4) |
| smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) |
| smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) |
| ext v30.8b, v24.8b , v25.8b , #4 |
| |
| saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5) |
| ext v29.8b, v24.8b , v25.8b , #6 |
| |
| ext v31.8b, v24.8b , v25.8b , #2 |
| shrn v28.4h, v6.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) |
| |
| ld1 {v5.8b, v6.8b, v7.8b}, [x0], x3 //// Load next Row data |
| smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5) |
| smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5) |
| smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) |
| smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) |
| shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4) |
| mov v20.d[1], v21.d[0] |
| sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 |
| |
| |
| ////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 |
| ////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) |
| |
| ////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 |
| |
| ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values |
| //// ////////////// ROW 3 /////////////////////// |
| |
| //// Process first vertical interpolated row |
| //// each column is |
| uaddl v20.8h, v8.8b, v5.8b //// a0 + a5 (column1,row0) |
| movi v31.8b, #5 |
| umlal v20.8h, v14.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) |
| umlal v20.8h, v17.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) |
| umlsl v20.8h, v11.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) |
| umlsl v20.8h, v2.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) |
| mov v21.d[0], v20.d[1] |
| |
| mov v28.d[1], v29.d[0] |
| sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 |
| shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) |
| |
| uaddl v22.8h, v9.8b, v6.8b //// a0 + a5 (column2,row0) |
| umlal v22.8h, v15.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) |
| umlal v22.8h, v18.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) |
| umlsl v22.8h, v12.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) |
| umlsl v22.8h, v3.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) |
| mov v23.d[0], v22.d[1] |
| |
| sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 |
| ext v30.8b, v20.8b , v21.8b , #4 |
| |
| uaddl v24.8h, v10.8b, v7.8b //// a0 + a5 (column3,row0) |
| ext v29.8b, v20.8b , v21.8b , #6 |
| umlal v24.8h, v16.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) |
| umlal v24.8h, v19.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) |
| umlsl v24.8h, v13.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) |
| umlsl v24.8h, v4.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) |
| mov v25.d[0], v24.d[1] |
| |
| st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values |
| st1 { v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values |
| |
| sqrshrun v8.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) |
| ext v31.8b, v21.8b , v22.8b , #2 |
| sqrshrun v9.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) |
| ext v28.8b, v20.8b , v21.8b , #2 |
| |
| saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) |
| ext v31.8b, v22.8b , v23.8b , #2 |
| smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1) |
| smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1) |
| smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) |
| smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) |
| ext v30.8b, v21.8b , v22.8b , #4 |
| |
| sqrshrun v10.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) |
| ext v29.8b, v21.8b , v22.8b , #6 |
| |
| ext v28.8b, v21.8b , v22.8b , #2 |
| saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) |
| smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2) |
| smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2) |
| smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) |
| smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) |
| ext v31.8b, v23.8b , v24.8b , #2 |
| |
| ext v8.8b, v8.8b , v9.8b , #2 |
| ext v9.8b, v9.8b , v10.8b , #2 |
| ext v10.8b, v10.8b , v10.8b , #2 |
| |
| st1 {v8.8b, v9.8b}, [x1], x12 //// store row1 - 1,1/2 grid |
| st1 {v10.h}[0], [x11], x12 //// store row1 - 1,1/2 grid |
| |
| ext v30.8b, v22.8b , v23.8b , #4 |
| ext v29.8b, v22.8b , v23.8b , #6 |
| |
| saddl v8.4s, v31.4h, v22.4h //// a0 + a5 (set3) |
| ext v28.8b, v22.8b , v23.8b , #2 |
| smlal v8.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3) |
| smlal v8.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3) |
| smlsl v8.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) |
| smlsl v8.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) |
| ext v31.8b, v24.8b , v25.8b , #2 |
| |
| shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2) |
| ext v30.8b, v23.8b , v24.8b , #4 |
| shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) |
| ext v29.8b, v23.8b , v24.8b , #6 |
| |
| saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4) |
| ext v28.8b, v23.8b , v24.8b , #2 |
| ext v31.8b, v25.8b , v25.8b , #2 |
| smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4) |
| smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4) |
| smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) |
| smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) |
| ext v30.8b, v24.8b , v25.8b , #4 |
| |
| saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5) |
| ext v29.8b, v24.8b , v25.8b , #6 |
| |
| ext v31.8b, v24.8b , v25.8b , #2 |
| shrn v28.4h, v8.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) |
| |
| ld1 {v8.8b, v9.8b, v10.8b}, [x0], x3 //// Load next Row data |
| smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5) |
| smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5) |
| smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) |
| smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) |
| shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4) |
| mov v20.d[1], v21.d[0] |
| sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 |
| |
| |
| ////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 |
| ////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) |
| |
| ////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 |
| |
| ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values |
| //// ////////////// ROW 4 /////////////////////// |
| |
| //// Process first vertical interpolated row |
| //// each column is |
| uaddl v20.8h, v11.8b, v8.8b //// a0 + a5 (column1,row0) |
| movi v31.8b, #5 |
| umlal v20.8h, v17.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) |
| umlal v20.8h, v2.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) |
| umlsl v20.8h, v14.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) |
| umlsl v20.8h, v5.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) |
| mov v21.d[0], v20.d[1] |
| mov v28.d[1], v29.d[0] |
| sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 |
| shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) |
| |
| uaddl v22.8h, v12.8b, v9.8b //// a0 + a5 (column2,row0) |
| umlal v22.8h, v18.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) |
| umlal v22.8h, v3.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) |
| umlsl v22.8h, v15.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) |
| umlsl v22.8h, v6.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) |
| mov v23.d[0], v22.d[1] |
| |
| sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 |
| ext v30.8b, v20.8b , v21.8b , #4 |
| |
| uaddl v24.8h, v13.8b, v10.8b //// a0 + a5 (column3,row0) |
| ext v29.8b, v20.8b , v21.8b , #6 |
| umlal v24.8h, v19.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) |
| umlal v24.8h, v4.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) |
| umlsl v24.8h, v16.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) |
| umlsl v24.8h, v7.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) |
| mov v25.d[0], v24.d[1] |
| |
| st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values |
| st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values |
| |
| sqrshrun v11.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) |
| ext v31.8b, v21.8b , v22.8b , #2 |
| sqrshrun v12.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) |
| ext v28.8b, v20.8b , v21.8b , #2 |
| |
| saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) |
| ext v31.8b, v22.8b , v23.8b , #2 |
| smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1) |
| smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1) |
| smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) |
| smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) |
| ext v30.8b, v21.8b , v22.8b , #4 |
| |
| sqrshrun v13.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) |
| ext v29.8b, v21.8b , v22.8b , #6 |
| |
| ext v28.8b, v21.8b , v22.8b , #2 |
| saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) |
| smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2) |
| smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2) |
| smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) |
| smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) |
| ext v31.8b, v23.8b , v24.8b , #2 |
| |
| ext v11.8b, v11.8b , v12.8b , #2 |
| ext v12.8b, v12.8b , v13.8b , #2 |
| ext v13.8b, v13.8b , v13.8b , #2 |
| |
| st1 {v11.8b, v12.8b}, [x1], x12 //// store row1 - 1,1/2 grid |
| st1 {v13.h}[0], [x11], x12 //// store row1 - 1,1/2 grid |
| |
| ext v30.8b, v22.8b , v23.8b , #4 |
| ext v29.8b, v22.8b , v23.8b , #6 |
| |
| saddl v12.4s, v31.4h, v22.4h //// a0 + a5 (set3) |
| ext v28.8b, v22.8b , v23.8b , #2 |
| smlal v12.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3) |
| smlal v12.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3) |
| smlsl v12.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) |
| smlsl v12.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) |
| ext v31.8b, v24.8b , v25.8b , #2 |
| |
| shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2) |
| ext v30.8b, v23.8b , v24.8b , #4 |
| shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) |
| ext v29.8b, v23.8b , v24.8b , #6 |
| |
| saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4) |
| ext v28.8b, v23.8b , v24.8b , #2 |
| ext v31.8b, v25.8b , v25.8b , #2 |
| smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4) |
| smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4) |
| smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) |
| smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) |
| ext v30.8b, v24.8b , v25.8b , #4 |
| |
| saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5) |
| ext v29.8b, v24.8b , v25.8b , #6 |
| |
| ext v31.8b, v24.8b , v25.8b , #2 |
| shrn v28.4h, v12.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) |
| |
| ld1 {v11.8b, v12.8b, v13.8b}, [x0], x3 //// Load next Row data |
| smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5) |
| smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5) |
| smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) |
| smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) |
| shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4) |
| mov v20.d[1], v21.d[0] |
| sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 |
| |
| |
| ////VQRSHRUN.s16 D27,Q14,#2 ;// half,half grid set3,4 |
| ////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) |
| |
| ////VQRSHRUN.s16 D28,Q14,#2 ;// half,half grid set5 |
| |
| ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grid values |
| //// ////////////// ROW 5 /////////////////////// |
| |
| //// Process first vertical interpolated row |
| //// each column is |
| uaddl v20.8h, v14.8b, v11.8b //// a0 + a5 (column1,row0) |
| movi v31.8b, #5 |
| umlal v20.8h, v2.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) |
| umlal v20.8h, v5.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) |
| umlsl v20.8h, v17.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) |
| umlsl v20.8h, v8.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) |
| mov v21.d[0], v20.d[1] |
| mov v28.d[1], v29.d[0] |
| sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 |
| shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) |
| |
| uaddl v22.8h, v15.8b, v12.8b //// a0 + a5 (column2,row0) |
| umlal v22.8h, v3.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) |
| umlal v22.8h, v6.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) |
| umlsl v22.8h, v18.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) |
| umlsl v22.8h, v9.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) |
| mov v23.d[0], v22.d[1] |
| |
| sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 |
| ext v30.8b, v20.8b , v21.8b , #4 |
| |
| uaddl v24.8h, v16.8b, v13.8b //// a0 + a5 (column3,row0) |
| ext v29.8b, v20.8b , v21.8b , #6 |
| umlal v24.8h, v4.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) |
| umlal v24.8h, v7.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) |
| umlsl v24.8h, v19.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) |
| umlsl v24.8h, v10.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) |
| mov v25.d[0], v24.d[1] |
| |
| st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values |
| st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values |
| |
| sqrshrun v14.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) |
| ext v31.8b, v21.8b , v22.8b , #2 |
| sqrshrun v15.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) |
| ext v28.8b, v20.8b , v21.8b , #2 |
| |
| saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) |
| ext v31.8b, v22.8b , v23.8b , #2 |
| smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1) |
| smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1) |
| smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) |
| smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) |
| ext v30.8b, v21.8b , v22.8b , #4 |
| |
| sqrshrun v16.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) |
| ext v29.8b, v21.8b , v22.8b , #6 |
| |
| ext v28.8b, v21.8b , v22.8b , #2 |
| saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) |
| smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2) |
| smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2) |
| smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) |
| smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) |
| ext v31.8b, v23.8b , v24.8b , #2 |
| |
| ext v14.8b, v14.8b , v15.8b , #2 |
| ext v15.8b, v15.8b , v16.8b , #2 |
| ext v16.8b, v16.8b , v16.8b , #2 |
| |
| st1 {v14.8b, v15.8b}, [x1], x12 //// store row1 - 1,1/2 grid |
| st1 {v16.h}[0], [x11], x12 //// store row1 - 1,1/2 grid |
| |
| ext v30.8b, v22.8b , v23.8b , #4 |
| ext v29.8b, v22.8b , v23.8b , #6 |
| |
| saddl v14.4s, v31.4h, v22.4h //// a0 + a5 (set3) |
| ext v28.8b, v22.8b , v23.8b , #2 |
| smlal v14.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3) |
| smlal v14.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3) |
| smlsl v14.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) |
| smlsl v14.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) |
| ext v31.8b, v24.8b , v25.8b , #2 |
| |
| shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2) |
| ext v30.8b, v23.8b , v24.8b , #4 |
| shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) |
| ext v29.8b, v23.8b , v24.8b , #6 |
| |
| saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4) |
| ext v28.8b, v23.8b , v24.8b , #2 |
| ext v31.8b, v25.8b , v25.8b , #2 |
| smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4) |
| smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4) |
| smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) |
| smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) |
| ext v30.8b, v24.8b , v25.8b , #4 |
| |
| saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5) |
| ext v29.8b, v24.8b , v25.8b , #6 |
| |
| ext v31.8b, v24.8b , v25.8b , #2 |
| shrn v28.4h, v14.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) |
| |
| ld1 {v14.8b, v15.8b, v16.8b}, [x0], x3 //// Load next Row data |
| smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5) |
| smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5) |
| smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) |
| smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) |
| shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4) |
| mov v20.d[1], v21.d[0] |
| sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 |
| |
| |
| ////VQRSHRUN.s16 D27,Q14,#2 ;// half,half grid set3,4 |
| ////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) |
| |
| ////VQRSHRUN.s16 D28,Q14,#2 ;// half,half grid set5 |
| |
| ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grid values |
| //// ////////////// ROW 6 /////////////////////// |
| |
| //// Process first vertical interpolated row |
| //// each column is |
| |
| cmp x10, #1 //// if it 17 rows are complete skip |
| beq filter_2dvh_skip_row |
| uaddl v20.8h, v17.8b, v14.8b //// a0 + a5 (column1,row0) |
| movi v31.8b, #5 |
| umlal v20.8h, v5.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) |
| umlal v20.8h, v8.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) |
| umlsl v20.8h, v2.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) |
| umlsl v20.8h, v11.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) |
| mov v21.d[0], v20.d[1] |
| mov v28.d[1], v29.d[0] |
| sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 |
| shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) |
| |
| uaddl v22.8h, v18.8b, v15.8b //// a0 + a5 (column2,row0) |
| umlal v22.8h, v6.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) |
| umlal v22.8h, v9.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) |
| umlsl v22.8h, v3.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) |
| umlsl v22.8h, v12.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) |
| mov v23.d[0], v22.d[1] |
| |
| sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 |
| ext v30.8b, v20.8b , v21.8b , #4 |
| |
| uaddl v24.8h, v19.8b, v16.8b //// a0 + a5 (column3,row0) |
| ext v29.8b, v20.8b , v21.8b , #6 |
| umlal v24.8h, v7.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) |
| umlal v24.8h, v10.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) |
| umlsl v24.8h, v4.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) |
| umlsl v24.8h, v13.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) |
| mov v25.d[0], v24.d[1] |
| |
| st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values |
| st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values |
| |
| sqrshrun v17.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) |
| ext v31.8b, v21.8b , v22.8b , #2 |
| sqrshrun v18.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) |
| ext v28.8b, v20.8b , v21.8b , #2 |
| |
| saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) |
| ext v31.8b, v22.8b , v23.8b , #2 |
| smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1) |
| smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1) |
| smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) |
| smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) |
| ext v30.8b, v21.8b , v22.8b , #4 |
| |
| sqrshrun v19.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) |
| ext v29.8b, v21.8b , v22.8b , #6 |
| |
| ext v28.8b, v21.8b , v22.8b , #2 |
| saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) |
| smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2) |
| smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2) |
| smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) |
| smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) |
| ext v31.8b, v23.8b , v24.8b , #2 |
| |
| ext v17.8b, v17.8b , v18.8b , #2 |
| ext v18.8b, v18.8b , v19.8b , #2 |
| ext v19.8b, v19.8b , v19.8b , #2 |
| |
| st1 {v17.8b, v18.8b}, [x1], x12 //// store row1 - 1,1/2 grid |
| st1 {v19.h}[0], [x11], x12 //// store row1 - 1,1/2 grid |
| |
| ext v30.8b, v22.8b , v23.8b , #4 |
| ext v29.8b, v22.8b , v23.8b , #6 |
| |
| saddl v18.4s, v31.4h, v22.4h //// a0 + a5 (set3) |
| ext v28.8b, v22.8b , v23.8b , #2 |
| smlal v18.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3) |
| smlal v18.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3) |
| smlsl v18.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) |
| smlsl v18.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) |
| ext v31.8b, v24.8b , v25.8b , #2 |
| |
| shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2) |
| ext v30.8b, v23.8b , v24.8b , #4 |
| shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) |
| ext v29.8b, v23.8b , v24.8b , #6 |
| |
| saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4) |
| ext v28.8b, v23.8b , v24.8b , #2 |
| ext v31.8b, v25.8b , v25.8b , #2 |
| smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4) |
| smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4) |
| smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) |
| smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) |
| ext v30.8b, v24.8b , v25.8b , #4 |
| |
| saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5) |
| ext v29.8b, v24.8b , v25.8b , #6 |
| |
| ext v31.8b, v24.8b , v25.8b , #2 |
| shrn v28.4h, v18.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) |
| |
| ld1 {v17.8b, v18.8b, v19.8b}, [x0], x3 //// Load next Row data |
| smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5) |
| smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5) |
| smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) |
| smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) |
| shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4) |
| mov v20.d[1], v21.d[0] |
| sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 |
| |
| mov v28.d[1], v29.d[0] |
| sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 |
| shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) |
| |
| sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 |
| |
| st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values |
| st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values |
| |
| subs x10, x10, #1 ////decrement loop counter |
| |
| bne filter_2dvh_loop |
| |
| |
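| //// Loop exit: no further rows are filtered below; execution falls through |
| //// to the register restore and return. The headings that follow are kept from the original listing. |
| //// ////////////// ROW 13 /////////////////////// |
| |
| //// Process first vertical interpolated row |
| //// each column is |
| |
| // Epilogue; the 32-bit ARM form is kept for reference: LDMFD sp!,{x10,x11,x12,pc} |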
| ldp x19, x20, [sp], #16 |
| pop_v_regs |
| ret |
| |
| filter_2dvh_skip_row: |
| mov v28.d[1], v29.d[0] |
| sqrshrun v27.8b, v28.8h, #2 //// exit path taken when x10 == 1: v28 now holds set3 (low half) and set4 (high half); rounding >>2 with unsigned saturation gives half,half grid set3,4 |
| shrn v28.4h, v22.4s, #8 //// set5: shift by 8 now, the remaining shift by 2 with rounding follows |
| |
| sqrshrun v28.8b, v28.8h, #2 //// half,half grid set5 (only halfword lane 0 is stored below) |
| |
| st1 {v26.8b, v27.8b}, [x2], #16 //// store 16 bytes of the half,half grid row (v26 = set1,2 was computed before branching here); x2 advances by 16 |
| st1 {v28.h}[0], [x2], x19 //// store 2 more bytes from set5, then advance x2 by x19 |
| // Epilogue, same sequence as the normal loop exit; the 32-bit ARM form is kept for reference: LDMFD sp!,{x10,x11,x12,pc} |
| ldp x19, x20, [sp], #16 |
| pop_v_regs |
| ret |
| |
| |
| ///***************************************** |