| //****************************************************************************** |
| //* |
| //* Copyright (C) 2015 The Android Open Source Project |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //***************************************************************************** |
| //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| //*/ |
| ///** |
| //****************************************************************************** |
| //* @file |
| //* ih264_inter_pred_luma_horz_av8.s |
| //* |
| //* @brief |
| //* Contains function definitions for inter prediction interpolation. |
| //* |
| //* @author |
| //* Ittiam |
| //* |
| //* @par List of Functions: |
| //* |
| //* - ih264_inter_pred_luma_horz_av8() |
| //* |
| //* @remarks |
| //* None |
| //* |
| //******************************************************************************* |
| //*/ |
| |
| ///* All the functions here are replicated from ih264_inter_pred_filters.c |
| // |
| |
| ///** |
| ///** |
| //******************************************************************************* |
| //* |
| //* @brief |
| //* Interprediction luma filter for horizontal input |
| //* |
| //* @par Description: |
| //* Applies a 6-tap horizontal filter. The output is clipped to 8 bits. |
| //* See sec 8.4.2.2.1, titled "Luma sample interpolation process". |
| //* |
| //* @param[in] pu1_src |
| //* UWORD8 pointer to the source |
| //* |
| //* @param[out] pu1_dst |
| //* UWORD8 pointer to the destination |
| //* |
| //* @param[in] src_strd |
| //* integer source stride |
| //* |
| //* @param[in] dst_strd |
| //* integer destination stride |
| //* |
| //* @param[in] ht |
| //* integer height of the array |
| //* |
| //* @param[in] wd |
| //* integer width of the array |
| //* |
| //* @returns |
| //* |
| //* @remarks |
| //* None |
| //* |
| //******************************************************************************* |
| //*/ |
| |
| //void ih264_inter_pred_luma_horz ( |
| // UWORD8 *pu1_src, |
| // UWORD8 *pu1_dst, |
| // WORD32 src_strd, |
| // WORD32 dst_strd, |
| // WORD32 ht, |
| // WORD32 wd ) |
| |
| //**************Variables Vs Registers***************************************** |
| // x0 => *pu1_src |
| // x1 => *pu1_dst |
| // x2 => src_strd |
| // x3 => dst_strd |
| // x4 => ht |
| // x5 => wd |
| |
| .text |
| .p2align 2 |
| |
| .include "ih264_neon_macros.s" |
| |
| .global ih264_inter_pred_luma_horz_av8 |
| |
| //------------------------------------------------------------------------------ |
| // ih264_inter_pred_luma_horz_av8: 6-tap horizontal luma interpolation. |
| // |
| // In: x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd, |
| // w4 = ht, w5 = wd (the last four are 32-bit WORD32 arguments) |
| // Out: filtered rows are written through x1; there is no return value. |
| // |
| // This entry sequence saves registers, widens the 32-bit arguments, moves the |
| // source pointer back by two pixels so that the first tap (a0) is read from |
| // pu1_src - 2, loads the filter constants 5 and 20, and dispatches on wd: |
| // wd == 8 -> loop_8, wd == 4 -> loop_4, anything else falls into loop_16. |
| //------------------------------------------------------------------------------ |
| ih264_inter_pred_luma_horz_av8: |
| |
| // push_v_regs is a macro from ih264_neon_macros.s (not visible in this file); |
| // presumably it saves the callee-saved SIMD registers - confirm there. |
| // This routine writes v8, v10 and v14 from that range. |
| push_v_regs |
| stp x19, x20, [sp, #-16]! //restored by the ldp at end_func |
| // AAPCS64 leaves the upper 32 bits of a register that carries a 32-bit |
| // argument unspecified. The strides are used below as 64-bit post-index |
| // offsets and ht/wd in 64-bit arithmetic, so sign-extend them first. |
| sxtw x2, w2 //src_strd |
| sxtw x3, w3 //dst_strd |
| sxtw x4, w4 //ht |
| sxtw x5, w5 //wd |
| sub x0, x0, #2 //pu1_src-2 |
| sub x14, x4, #16 //x14 = ht - 16, used as the pass counter by loop_16 and loop_8 |
| movi v0.8b, #5 //filter coeff |
| subs x12, x5, #8 //if wd=8 branch to loop_8 (only the flags are used) |
| movi v1.8b, #20 //filter coeff |
| beq loop_8 |
| |
| subs x12, x5, #4 //if wd=4 branch to loop_4 (only the flags are used) |
| beq loop_4 |
| |
| loop_16: //wd=16 path (also reached by fall-through when wd is neither 8 nor 4): each pass filters 8 rows of 16 pixels, two rows at a time |
| //// Processing row0 and row1. Per output pixel: (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 saturated to 8 bits, with v0 = 5 and v1 = 20; column1 = pixels 0-7, column2 = pixels 8-15 |
| ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row0 |
| add x14, x14, #1 //pass counter: x14 was set to ht - 16 at entry, so it equals 1 after this add only on the first pass with ht==16 |
| ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row0) |
| ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1 |
| ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row0) |
| uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row0) |
| ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row1) |
| uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row0) |
| ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row1) |
| uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row1) |
| ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row0) |
| uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row1) |
| ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row0) |
| umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) |
| ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row1) |
| umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) |
| ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row1) |
| umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row1) |
| ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row0) |
| umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row1) |
| ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row0) |
| umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) |
| ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row1) |
| umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) |
| ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row1) |
| umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row1) |
| ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row0) |
| umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row1) |
| ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row0) |
| umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) |
| ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row1) |
| umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) |
| ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row1) |
| umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) |
| ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row0) |
| umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1) |
| ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row0) |
| umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) |
| ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row1) |
| umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) |
| ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row1) |
| umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) |
| ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row2 |
| umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1) |
| |
| sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) |
| ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row3 |
| sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) |
| ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row2) |
| st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row0 |
| sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) |
| ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row2) |
| sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1) |
| |
| |
| |
| //// Processing row2 and row3 |
| ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row3) |
| uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2) |
| st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row1 |
| uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row2) |
| ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row3) |
| uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3) |
| ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row2) |
| uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row3) |
| ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row2) |
| umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row2) |
| ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row3) |
| umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row2) |
| ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row3) |
| umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3) |
| ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row2) |
| umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row3) |
| ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row2) |
| umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row2) |
| ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row3) |
| umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row2) |
| ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row3) |
| umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3) |
| ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row2) |
| umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row3) |
| ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row2) |
| umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2) |
| ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row3) |
| umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row2) |
| ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row3) |
| umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3) |
| ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row2) |
| umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row3) |
| ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row2) |
| umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2) |
| ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row3) |
| umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row2) |
| ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row3) |
| umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3) |
| ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row4 |
| umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row3) |
| |
| sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2) |
| ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row5 |
| sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row2) |
| ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row4) |
| st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row2 |
| sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3) |
| ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row4) |
| sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row3) |
| |
| |
| //// Processing row4 and row5 |
| ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row5) |
| uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row4) |
| st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row3 |
| uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row4) |
| ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row5) |
| uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row5) |
| ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row4) |
| uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row5) |
| ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row4) |
| umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row4) |
| ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row5) |
| umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row4) |
| ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row5) |
| umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row5) |
| ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row4) |
| umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row5) |
| ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row4) |
| umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row4) |
| ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row5) |
| umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row4) |
| ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row5) |
| umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row5) |
| ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row4) |
| umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row5) |
| ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row4) |
| umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row4) |
| ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row5) |
| umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row4) |
| ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row5) |
| umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row5) |
| ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row4) |
| umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row5) |
| ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row4) |
| umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row4) |
| ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row5) |
| umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row4) |
| ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row5) |
| umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row5) |
| ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row6 |
| umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row5) |
| |
| sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row4) |
| ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row7 |
| sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row4) |
| ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row6) |
| st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row4 |
| sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row5) |
| ext v30.8b, v3.8b , v4.8b, #5 ////extract a[5] (column2,row6) |
| sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row5) |
| |
| |
| |
| //// Processing row6 and row7 |
| |
| ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row7) |
| uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row6) |
| st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row5 |
| uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row6) |
| ext v27.8b, v6.8b , v7.8b, #5 ////extract a[5] (column2,row7) |
| uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row7) |
| ext v31.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row6) |
| uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row7) |
| ext v30.8b, v3.8b , v4.8b, #2 ////extract a[2] (column2,row6) |
| umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row6) |
| ext v27.8b, v6.8b , v7.8b, #2 ////extract a[2] (column2,row7) |
| umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row6) |
| ext v28.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row7) |
| umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row7) |
| ext v31.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row6) |
| umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row7) |
| ext v30.8b, v3.8b , v4.8b, #3 ////extract a[3] (column2,row6) |
| umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row6) |
| ext v28.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row7) |
| umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row6) |
| ext v27.8b, v6.8b , v7.8b, #3 ////extract a[3] (column2,row7) |
| umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row7) |
| ext v31.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row6) |
| umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row7) |
| ext v30.8b, v3.8b , v4.8b, #1 ////extract a[1] (column2,row6) |
| umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row6) |
| ext v28.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row7) |
| umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row6) |
| ext v27.8b, v6.8b , v7.8b, #1 ////extract a[1] (column2,row7) |
| umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row7) |
| ext v31.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row6) |
| umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row7) |
| ext v30.8b, v3.8b , v4.8b, #4 ////extract a[4] (column2,row6) |
| umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row6) |
| ext v28.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row7) |
| umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row6) |
| ext v27.8b, v6.8b , v7.8b, #4 ////extract a[4] (column2,row7) |
| |
| sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row6) |
| umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row7) |
| sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row6) |
| umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row7) |
| sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row7) |
| st1 {v20.8b, v21.8b}, [x1], x3 ////Store dest row6 |
| sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row7) |
| subs x12, x14, #1 // Z is set only when x14==1, i.e. after the first pass with ht==16; the store below does not alter the flags |
| st1 {v23.8b, v24.8b}, [x1], x3 ////Store dest row7 |
| |
| |
| |
| beq loop_16 |
| b end_func |
| |
| |
| |
| loop_8: |
| //// wd==8 path: 8 output pixels per row and up to 8 rows per pass; leaves after 4 rows when ht==4 and runs a second pass when ht==16. Processing row0 and row1 |
| |
| |
| ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row0 |
| add x14, x14, #1 //pass counter: x14 was set to ht - 16 at entry, so it equals 1 after this add only on the first pass with ht==16 |
| ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row0) |
| ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row1 |
| ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row0) |
| ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row1) |
| ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row0) |
| ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row0) |
| ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row0) |
| uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row0) |
| ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row1) |
| umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) |
| umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) |
| umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) |
| umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) |
| ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row1) |
| uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row1) |
| ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row1) |
| ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row1) |
| ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row2 |
| umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a3 (column1,row1) |
| umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a3 + 20a2 (column1,row1) |
| umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) |
| umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) |
| ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row3 |
| sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) |
| |
| //// Processing row2 and row3 |
| ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row3) |
| ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row3) |
| ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row2) |
| uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3) |
| st1 {v23.8b}, [x1], x3 ////Store dest row0 |
| ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row3) |
| ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row3) |
| sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) |
| ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row3) |
| ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row2) |
| umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3) |
| umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3) |
| umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3) |
| umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3) |
| st1 {v20.8b}, [x1], x3 ////Store dest row1 |
| uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2) |
| ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row2) |
| ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row2) |
| ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row2) |
| ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row4 |
| umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a3 (column1,row2) |
| umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a3 + 20a2 (column1,row2) |
| umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2) |
| umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2) |
| ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row5 (rows 4 and 5 are fetched before the ht==4 exit below, so they are read even when only 4 rows are produced) |
| subs x9, x4, #4 |
| sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3) |
| ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row5) |
| ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row5) |
| ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row4) |
| uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row5) |
| ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row5) |
| sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2) |
| ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row5) |
| ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row4) |
| st1 {v20.8b}, [x1], x3 ////Store dest row2 |
| ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row4) |
| uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row4) |
| st1 {v23.8b}, [x1], x3 ////Store dest row3 |
| beq end_func // Branch if height==4: uses the Z flag from the earlier "subs x9, x4, #4"; no instruction in between writes the flags |
| |
| //// Processing row4 and row5 |
| ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row5) |
| umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row5) |
| umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row5) |
| umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row5) |
| umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row5) |
| ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row4) |
| ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row4) |
| ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row6 |
| umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a3 (column1,row4) |
| umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a3 + 20a2 (column1,row4) |
| umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row4) |
| umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row4) |
| sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row5) |
| ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row7 |
| ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row6) |
| ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row7) |
| ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row7) |
| uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row7) |
| ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row7) |
| ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row7) |
| sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row4) |
| ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row6) |
| ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row6) |
| st1 {v20.8b}, [x1], x3 ////Store dest row4 |
| ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row6) |
| uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row6) |
| ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row6) |
| umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a3 (column1,row6) |
| umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a3 + 20a2 (column1,row6) |
| umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row6) |
| umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row6) |
| //// Processing row6 and row7 |
| st1 {v23.8b}, [x1], x3 ////Store dest row5 |
| ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row7) |
| umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row7) |
| umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row7) |
| umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row7) |
| umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row7) |
| sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row6) |
| subs x12, x14, #1 |
| st1 {v20.8b}, [x1], x3 ////Store dest row6 |
| sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row7) |
| st1 {v23.8b}, [x1], x3 ////Store dest row7 |
| |
| beq loop_8 //looping if height ==16: Z comes from "subs x12, x14, #1" above and is set only when x14==1, i.e. after the first pass with ht==16 |
| |
| b end_func |
| loop_4: |
| ld1 {v5.8b, v6.8b}, [x0], x2 //// wd==4 path: 4 rows per pass, only the low 4 result bytes of each row are stored. Load row0 |
| ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row0) |
| ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row1 |
| ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row0) |
| ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row1) |
| uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row0) |
| ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row0) |
| ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row0) |
| ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row0) |
| ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row1) |
| umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) |
| umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) |
| umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) |
| umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) |
| uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row1) |
| ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row1) |
| ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row1) |
| ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row1) |
| ld1 {v2.8b, v3.8b}, [x0], x2 //// Load row2 |
| umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a3 (column1,row1) |
| umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a3 + 20a2 (column1,row1) |
| umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) |
| umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) |
| ld1 {v5.8b, v6.8b}, [x0], x2 //// Load row3 |
| ext v28.8b, v5.8b , v6.8b, #5 ////extract a[5] (column1,row3) |
| ext v25.8b, v5.8b , v6.8b, #2 ////extract a[2] (column1,row3) |
| sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) |
| ext v31.8b, v2.8b , v3.8b, #5 ////extract a[5] (column1,row2) |
| ext v24.8b, v5.8b , v6.8b, #3 ////extract a[3] (column1,row3) |
| st1 {v23.s}[0], [x1], x3 ////Store dest row0 (one 32-bit lane = 4 pixels) |
| ext v23.8b, v5.8b , v6.8b, #1 ////extract a[1] (column1,row3) |
| ext v22.8b, v5.8b , v6.8b, #4 ////extract a[4] (column1,row3) |
| ext v29.8b, v2.8b , v3.8b, #3 ////extract a[3] (column1,row2) |
| sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) |
| ext v30.8b, v2.8b , v3.8b, #2 ////extract a[2] (column1,row2) |
| ext v27.8b, v2.8b , v3.8b, #1 ////extract a[1] (column1,row2) |
| |
| //// Processing row2 and row3 |
| st1 {v20.s}[0], [x1], x3 ////Store dest row1 |
| uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row3) |
| ext v26.8b, v2.8b , v3.8b, #4 ////extract a[4] (column1,row2) |
| umlal v14.8h, v25.8b, v1.8b //// a0 + a5 + 20a2 (column1,row3) |
| umlal v14.8h, v24.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row3) |
| umlsl v14.8h, v23.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row3) |
| umlsl v14.8h, v22.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row3) |
| uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row2) |
| umlal v8.8h, v29.8b, v1.8b //// a0 + a5 + 20a3 (column1,row2) |
| umlal v8.8h, v30.8b, v1.8b //// a0 + a5 + 20a3 + 20a2 (column1,row2) |
| umlsl v8.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row2) |
| umlsl v8.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row2) |
| sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row3) |
| sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row2) |
| st1 {v20.s}[0], [x1], x3 ////Store dest row2 |
| subs x4, x4, #8 // Loop if height =8: x4 becomes 0 (Z set) only on the first pass with ht==8; the store below does not alter the flags |
| st1 {v23.s}[0], [x1], x3 ////Store dest row3 |
| beq loop_4 |
| |
| end_func: |
| // Common exit for all three width paths: reload x19/x20 (pairs with the stp at entry), run pop_v_regs (macro from ih264_neon_macros.s, presumably the inverse of push_v_regs - confirm there) and return. Legacy ARMv7 form was: LDMFD sp!,{x4-x12,PC} |
| ldp x19, x20, [sp], #16 |
| pop_v_regs |
| ret |
| |
| |
| |