| //****************************************************************************** |
| //* |
| //* Copyright (C) 2015 The Android Open Source Project |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //***************************************************************************** |
| //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| //*/ |
| ///** |
| //****************************************************************************** |
| //* @file |
| //* ih264_inter_pred_luma_vert_av8.s |
| //* |
| //* @brief |
| //* Contains function definitions for inter prediction interpolation. |
| //* |
| //* @author |
| //* Ittiam |
| //* |
| //* @par List of Functions: |
| //* |
| //* - ih264_inter_pred_luma_vert_av8() |
| //* |
| //* @remarks |
| //* None |
| //* |
| //******************************************************************************* |
| //*/ |
| |
| ///* All the functions here are replicated from ih264_inter_pred_filters.c |
| // |
| |
| ///** |
| ///** |
| ///** |
| // ******************************************************************************* |
| // * |
| // * @brief |
| // * Interprediction luma filter for vertical input |
| // * |
| // * @par Description: |
| // * Applies a 6-tap vertical filter. The output is clipped to 8 bits |
| // * sec 8.4.2.2.1 titled "Luma sample interpolation process" |
| // * |
| // * @param[in] pu1_src |
| // * UWORD8 pointer to the source |
| // * |
| // * @param[out] pu1_dst |
| // * UWORD8 pointer to the destination |
| // * |
| // * @param[in] src_strd |
| // * integer source stride |
| // * |
| // * @param[in] dst_strd |
| // * integer destination stride |
| // * |
| // * @param[in] ht |
| // * integer height of the array |
| // * |
| // * @param[in] wd |
| // * integer width of the array |
| // * |
| // * @returns |
| // * |
| // * @remarks |
| // * None |
| // * |
| // ******************************************************************************* |
| |
| //void ih264_inter_pred_luma_vert ( |
| // UWORD8 *pu1_src, |
| // UWORD8 *pu1_dst, |
| // WORD32 src_strd, |
| // WORD32 dst_strd, |
| // WORD32 ht, |
| // WORD32 wd ) |
| |
| //**************Variables Vs Registers***************************************** |
| // x0 => *pu1_src |
| // x1 => *pu1_dst |
| // x2 => src_strd |
| // x3 => dst_strd |
| // x4 => ht |
| // x5 => wd |
| |
| .text |
| .p2align 2 |
| .include "ih264_neon_macros.s" |
| |
| |
| |
| |
| .global ih264_inter_pred_luma_vert_av8 |
| |
| ih264_inter_pred_luma_vert_av8: |
| // 6-tap vertical luma interpolation. For every output pixel the code computes |
| //   CLIP_U8((src[0] + src[5] + 20*(src[2] + src[3]) - 5*(src[1] + src[4]) + 16) >> 5) |
| // over six vertically adjacent source rows, starting two rows above the output row. |
| // |
| // Register roles inside this routine: |
| //   x0  source pointer, post-incremented by x2 on every row load |
| //   x1  destination pointer, post-incremented by x3 on every row store |
| //   x14 height counter (starts at ht - 16), x9/x12 scratch for flag-setting subs |
| //   v22 = 20 and v24 = 5 in every halfword lane (filter coefficients) |
| // wd == 8 and wd == 4 have dedicated paths; any other wd falls into the 16-wide path. |
| |
| // push_v_regs / pop_v_regs / swp are macros from ih264_neon_macros.s (not visible here); |
| // presumably push_v_regs saves the callee-saved NEON registers - confirm in that file. |
| push_v_regs |
| // x19/x20 are saved here and restored at end_func; no instruction visible in this |
| // function references them. |
| stp x19, x20, [sp, #-16]! |
| |
| // src_strd, dst_strd, ht and wd are WORD32 in the C prototype comment above. AAPCS64 leaves |
| // bits 63:32 of a 32-bit argument unspecified, and the code below uses the full x registers |
| // for address arithmetic and compares, so sign-extend them first. |
| sxtw x2, w2 // src_strd |
| sxtw x3, w3 // dst_strd |
| sxtw x4, w4 // ht |
| sxtw x5, w5 // wd |
| |
| sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd |
| |
| sub x14, x4, #16 // x14 = ht - 16, used by the loop-exit checks |
| movi v22.8h, #20 // Filter coeff 20 (0x14) into v22 |
| |
| subs x12, x5, #8 //if wd=8 branch to loop_8 |
| movi v24.8h, #5 // Filter coeff 5 into v24 |
| beq loop_8_start |
| |
| subs x12, x5, #4 //if wd=4 branch to loop_4 |
| beq loop_4_start |
| |
| |
| // 16-wide path. Each source row lives in a register pair: even register = left 8 pixels, |
| // odd register = right 8 pixels. Six consecutive rows are preloaded into v0/v1 .. v10/v11. |
| ld1 {v0.2s, v1.2s}, [x0], x2 // Vector load from src[0_0] |
| ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[1_0] |
| ld1 {v4.2s, v5.2s}, [x0], x2 // Vector load from src[2_0] |
| ld1 {v6.2s, v7.2s}, [x0], x2 // Vector load from src[3_0] |
| add x14, x14, #1 //for checking loop |
| ld1 {v8.2s, v9.2s}, [x0], x2 // Vector load from src[4_0] |
| uaddl v12.8h, v4.8b, v6.8b // temp1 = src[2_0] + src[3_0] |
| ld1 {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0] |
| |
| loop_16: //when wd=16 |
| // One pass writes 8 output rows. Loads of later source rows are interleaved with the |
| // arithmetic of earlier output rows, so the instruction order below matters. |
| |
| uaddl v14.8h, v0.8b, v10.8b // temp = src[0_0] + src[5_0] |
| uaddl v16.8h, v2.8b, v8.8b // temp2 = src[1_0] + src[4_0] |
| mla v14.8h, v12.8h, v22.8h // temp += temp1 * 20 |
| uaddl v20.8h, v1.8b, v11.8b // temp4 = src[0_8] + src[5_8] |
| uaddl v18.8h, v5.8b, v7.8b // temp3 = src[2_8] + src[3_8] |
| mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20 |
| ld1 {v0.2s, v1.2s}, [x0], x2 // next source row replaces the oldest one |
| uaddl v26.8h, v3.8b, v9.8b // temp5 = src[1_8] + src[4_8] |
| uaddl v12.8h, v6.8b, v8.8b |
| mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5 |
| uaddl v16.8h, v2.8b, v0.8b |
| uaddl v18.8h, v4.8b, v10.8b |
| mla v16.8h, v12.8h , v22.8h |
| mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 |
| uaddl v26.8h, v5.8b, v11.8b |
| uaddl v12.8h, v7.8b, v9.8b |
| sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) |
| uaddl v14.8h, v3.8b, v1.8b |
| ld1 {v2.2s, v3.2s}, [x0], x2 |
| mla v14.8h, v12.8h , v22.8h |
| mls v16.8h, v18.8h , v24.8h |
| sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) |
| uaddl v18.8h, v4.8b, v2.8b |
| uaddl v12.8h, v8.8b, v10.8b |
| |
| st1 {v30.2s, v31.2s}, [x1], x3 // Vector store to dst[0_0] |
| mla v18.8h, v12.8h , v22.8h |
| uaddl v20.8h, v6.8b, v0.8b |
| mls v14.8h, v26.8h , v24.8h |
| sqrshrun v30.8b, v16.8h, #5 |
| uaddl v12.8h, v9.8b, v11.8b |
| uaddl v16.8h, v5.8b, v3.8b |
| uaddl v26.8h, v7.8b, v1.8b |
| mla v16.8h, v12.8h , v22.8h |
| mls v18.8h, v20.8h , v24.8h |
| ld1 {v4.2s, v5.2s}, [x0], x2 |
| |
| sqrshrun v31.8b, v14.8h, #5 |
| uaddl v12.8h, v10.8b, v0.8b |
| uaddl v14.8h, v6.8b, v4.8b |
| uaddl v20.8h, v8.8b, v2.8b |
| mla v14.8h, v12.8h , v22.8h |
| mls v16.8h, v26.8h , v24.8h |
| st1 {v30.2s, v31.2s}, [x1], x3 //store row 1 |
| sqrshrun v30.8b, v18.8h, #5 |
| uaddl v18.8h, v7.8b, v5.8b |
| uaddl v12.8h, v11.8b, v1.8b |
| mla v18.8h, v12.8h , v22.8h |
| uaddl v26.8h, v9.8b, v3.8b |
| mls v14.8h, v20.8h , v24.8h |
| ld1 {v6.2s, v7.2s}, [x0], x2 |
| sqrshrun v31.8b, v16.8h, #5 |
| mls v18.8h, v26.8h , v24.8h |
| uaddl v12.8h, v0.8b, v2.8b // temp1 = src[2_0] + src[3_0] |
| st1 {v30.2s, v31.2s}, [x1], x3 //store row 2 |
| uaddl v16.8h, v10.8b, v4.8b // temp2 = src[1_0] + src[4_0] |
| uaddl v20.8h, v9.8b, v7.8b // temp4 = src[0_8] + src[5_8] |
| sqrshrun v30.8b, v14.8h, #5 |
| uaddl v26.8h, v5.8b, v11.8b // temp5 = src[1_8] + src[4_8] |
| uaddl v14.8h, v8.8b, v6.8b // temp = src[0_0] + src[5_0] |
| sqrshrun v31.8b, v18.8h, #5 |
| mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20 |
| uaddl v18.8h, v1.8b, v3.8b // temp3 = src[2_8] + src[3_8] |
| st1 {v30.2s, v31.2s}, [x1], x3 //store row 3 |
| // 4 rows processed |
| mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20 |
| ld1 {v8.2s, v9.2s}, [x0], x2 |
| uaddl v12.8h, v2.8b, v4.8b |
| uaddl v18.8h, v3.8b, v5.8b |
| mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5 |
| uaddl v28.8h, v9.8b, v11.8b |
| uaddl v16.8h, v6.8b, v0.8b |
| mla v28.8h, v18.8h , v22.8h // temp4 += temp3 * 20 |
| mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 |
| uaddl v26.8h, v1.8b, v7.8b |
| uaddl v18.8h, v5.8b, v7.8b |
| sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) |
| uaddl v14.8h, v8.8b, v10.8b |
| |
| sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) |
| ld1 {v10.2s, v11.2s}, [x0], x2 |
| mls v28.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 |
| st1 {v30.2s, v31.2s}, [x1], x3 // store row 4 |
| mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20 |
| uaddl v20.8h, v11.8b, v1.8b |
| uaddl v26.8h, v3.8b, v9.8b |
| mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20 |
| uaddl v12.8h, v6.8b, v4.8b |
| uaddl v18.8h, v7.8b, v9.8b |
| sqrshrun v31.8b, v28.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) |
| mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5 |
| uaddl v16.8h, v8.8b, v2.8b |
| sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) |
| mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 |
| uaddl v14.8h, v10.8b, v0.8b |
| st1 {v30.2s, v31.2s}, [x1], x3 // store row 5 |
| mla v14.8h, v12.8h , v22.8h // temp += temp1 * 20 |
| ld1 {v0.2s, v1.2s}, [x0], x2 |
| uaddl v26.8h, v5.8b, v11.8b |
| uaddl v12.8h, v8.8b, v6.8b |
| uaddl v28.8h, v0.8b, v2.8b |
| sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) |
| mla v28.8h, v12.8h , v22.8h // temp += temp1 * 20 |
| uaddl v20.8h, v1.8b, v3.8b |
| mls v14.8h, v16.8h , v24.8h // temp -= temp2 * 5 |
| mla v20.8h, v18.8h , v22.8h // temp4 += temp3 * 20 |
| uaddl v16.8h, v10.8b, v4.8b |
| sqrshrun v30.8b, v14.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) |
| // From here on the row registers are shuffled (mov / swp, interleaved with the last |
| // arithmetic) so that v0/v1 .. v8/v9 line up as the first five rows of the next pass. |
| // NOTE(review): this assumes swp exchanges its two operands - confirm in ih264_neon_macros.s. |
| mov v2.8b, v6.8b |
| mov v3.8b, v7.8b |
| mls v28.8h, v16.8h , v24.8h // temp -= temp2 * 5 |
| st1 {v30.2s, v31.2s}, [x1], x3 // store row 6 |
| sqrshrun v30.8b, v28.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) |
| |
| swp v0.8b, v4.8b |
| swp v1.8b, v5.8b |
| |
| |
| |
| mls v20.8h, v26.8h , v24.8h // temp4 -= temp5 * 5 |
| mov v6.8b, v10.8b |
| mov v7.8b, v11.8b |
| subs x12, x14, #1 // zero only on the first pass with ht == 16 (x14 = ht - 15 there) |
| |
| swp v4.8b, v8.8b |
| swp v5.8b, v9.8b |
| |
| |
| sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) |
| st1 {v30.2s, v31.2s}, [x1], x3 // store row 7 |
| bne end_func //if height =8 end function |
| add x14, x14, #1 //for checking loop: makes the next subs non-zero |
| ld1 {v10.2s, v11.2s}, [x0], x2 // sixth row for the next pass |
| uaddl v12.8h, v4.8b, v6.8b // temp1 = src[2_0] + src[3_0] |
| |
| b loop_16 // looping if height =16 |
| |
| loop_8_start: |
| // 8-wide path: one 8-byte register per source row, six rows preloaded into v0..v5. |
| //// Processing row0 and row1 |
| |
| ld1 {v0.2s}, [x0], x2 // Vector load from src[0_0] |
| ld1 {v1.2s}, [x0], x2 // Vector load from src[1_0] |
| ld1 {v2.2s}, [x0], x2 // Vector load from src[2_0] |
| ld1 {v3.2s}, [x0], x2 // Vector load from src[3_0] |
| add x14, x14, #1 //for checking loop |
| ld1 {v4.2s}, [x0], x2 // Vector load from src[4_0] |
| ld1 {v5.2s}, [x0], x2 // Vector load from src[5_0] |
| |
| loop_8: |
| // One pass writes 4 output rows, then (unless ht == 4) 4 more. |
| uaddl v6.8h, v2.8b, v3.8b // temp1 = src[2_0] + src[3_0] |
| uaddl v8.8h, v0.8b, v5.8b // temp = src[0_0] + src[5_0] |
| uaddl v10.8h, v1.8b, v4.8b // temp2 = src[1_0] + src[4_0] |
| mla v8.8h, v6.8h , v22.8h // temp += temp1 * 20 |
| ld1 {v6.2s}, [x0], x2 |
| uaddl v14.8h, v3.8b, v4.8b |
| uaddl v16.8h, v1.8b, v6.8b |
| uaddl v18.8h, v2.8b, v5.8b |
| mls v8.8h, v10.8h , v24.8h // temp -= temp2 * 5 |
| mla v16.8h, v14.8h , v22.8h |
| ld1 {v7.2s}, [x0], x2 |
| uaddl v20.8h, v4.8b, v5.8b |
| uaddl v12.8h, v2.8b, v7.8b |
| uaddl v10.8h, v3.8b, v6.8b |
| mls v16.8h, v18.8h , v24.8h |
| sqrshrun v26.8b, v8.8h, #5 // dst[0_0] = CLIP_U8( (temp + 16) >> 5) |
| mla v12.8h, v20.8h , v22.8h |
| ld1 {v0.2s}, [x0], x2 |
| uaddl v14.8h, v5.8b, v6.8b |
| sqrshrun v27.8b, v16.8h, #5 |
| uaddl v20.8h, v3.8b, v0.8b |
| mls v12.8h, v10.8h , v24.8h |
| st1 {v26.2s}, [x1], x3 // Vector store to dst[0_0] |
| uaddl v18.8h, v4.8b, v7.8b |
| mla v20.8h, v14.8h , v22.8h |
| st1 {v27.2s}, [x1], x3 |
| sqrshrun v28.8b, v12.8h, #5 |
| st1 {v28.2s}, [x1], x3 |
| mls v20.8h, v18.8h , v24.8h |
| ld1 {v1.2s}, [x0], x2 |
| sqrshrun v29.8b, v20.8h, #5 |
| subs x9, x4, #4 // flags survive the store below (st1 does not set flags) |
| st1 {v29.2s}, [x1], x3 //store row 3 |
| |
| |
| beq end_func // Branch if height==4 |
| |
| |
| uaddl v14.8h, v6.8b, v7.8b // temp1 = src[2_0] + src[3_0] |
| uaddl v16.8h, v0.8b, v5.8b // temp2 = src[1_0] + src[4_0] |
| uaddl v18.8h, v1.8b, v4.8b // temp = src[0_0] + src[5_0] |
| mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20 |
| ld1 {v2.2s}, [x0], x2 |
| mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5 |
| uaddl v8.8h, v0.8b, v7.8b |
| uaddl v10.8h, v1.8b, v6.8b |
| uaddl v12.8h, v2.8b, v5.8b |
| sqrshrun v26.8b, v18.8h, #5 |
| mla v12.8h, v8.8h , v22.8h |
| ld1 {v3.2s}, [x0], x2 |
| mls v12.8h, v10.8h , v24.8h |
| st1 {v26.2s}, [x1], x3 |
| sqrshrun v27.8b, v12.8h, #5 |
| st1 {v27.2s}, [x1], x3 |
| uaddl v14.8h, v0.8b, v1.8b // temp1 = src[2_0] + src[3_0] |
| uaddl v16.8h, v2.8b, v7.8b // temp2 = src[1_0] + src[4_0] |
| uaddl v18.8h, v3.8b, v6.8b // temp = src[0_0] + src[5_0] |
| mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20 |
| ld1 {v4.2s}, [x0], x2 |
| mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5 |
| uaddl v8.8h, v2.8b, v1.8b |
| uaddl v10.8h, v3.8b, v0.8b |
| uaddl v12.8h, v4.8b, v7.8b |
| sqrshrun v26.8b, v18.8h, #5 |
| mla v12.8h, v8.8h , v22.8h |
| ld1 {v5.2s}, [x0], x2 |
| mls v12.8h, v10.8h , v24.8h |
| st1 {v26.2s}, [x1], x3 |
| sqrshrun v27.8b, v12.8h, #5 |
| subs x12, x14, #1 // zero only on the first pass with ht == 16 |
| st1 {v27.2s}, [x1], x3 |
| add x14, x14, #1 // plain add: leaves the flags from subs intact for the beq below |
| beq loop_8 //looping if height ==16 |
| |
| b end_func |
| |
| |
| loop_4_start: |
| // 4-wide path: only lane s[0] of each row register is loaded and stored. |
| // Not a loop: writes 4 rows, then 4 more unless ht == 4, and falls into end_func. |
| //// Processing row0 and row1 |
| |
| |
| ld1 {v0.s}[0], [x0], x2 // Vector load from src[0_0] |
| ld1 {v1.s}[0], [x0], x2 // Vector load from src[1_0] |
| ld1 {v2.s}[0], [x0], x2 // Vector load from src[2_0] |
| ld1 {v3.s}[0], [x0], x2 // Vector load from src[3_0] |
| ld1 {v4.s}[0], [x0], x2 // Vector load from src[4_0] |
| ld1 {v5.s}[0], [x0], x2 // Vector load from src[5_0] |
| |
| uaddl v6.8h, v2.8b, v3.8b // temp1 = src[2_0] + src[3_0] |
| uaddl v8.8h, v0.8b, v5.8b // temp = src[0_0] + src[5_0] |
| uaddl v10.8h, v1.8b, v4.8b // temp2 = src[1_0] + src[4_0] |
| mla v8.8h, v6.8h , v22.8h // temp += temp1 * 20 |
| ld1 {v6.s}[0], [x0], x2 // 4-byte lane load like the other rows: only s[0] is ever stored, so do not read 8 bytes |
| uaddl v14.8h, v3.8b, v4.8b |
| uaddl v16.8h, v1.8b, v6.8b |
| uaddl v18.8h, v2.8b, v5.8b |
| mls v8.8h, v10.8h , v24.8h // temp -= temp2 * 5 |
| ld1 {v7.s}[0], [x0], x2 |
| mla v16.8h, v14.8h , v22.8h |
| uaddl v20.8h, v4.8b, v5.8b |
| uaddl v12.8h, v2.8b, v7.8b |
| uaddl v10.8h, v3.8b, v6.8b |
| mls v16.8h, v18.8h , v24.8h |
| sqrshrun v26.8b, v8.8h, #5 // dst[0_0] = CLIP_U8( (temp + 16) >> 5) |
| mla v12.8h, v20.8h , v22.8h |
| ld1 {v0.s}[0], [x0], x2 |
| uaddl v14.8h, v5.8b, v6.8b |
| sqrshrun v27.8b, v16.8h, #5 |
| uaddl v20.8h, v3.8b, v0.8b |
| mls v12.8h, v10.8h , v24.8h |
| st1 {v26.s}[0], [x1], x3 // Vector store to dst[0_0] |
| uaddl v18.8h, v4.8b, v7.8b |
| mla v20.8h, v14.8h , v22.8h |
| st1 {v27.s}[0], [x1], x3 |
| sqrshrun v28.8b, v12.8h, #5 |
| st1 {v28.s}[0], [x1], x3 |
| mls v20.8h, v18.8h , v24.8h |
| ld1 {v1.s}[0], [x0], x2 |
| sqrshrun v29.8b, v20.8h, #5 |
| st1 {v29.s}[0], [x1], x3 //store row 3 |
| |
| subs x9, x4, #4 |
| beq end_func // Branch if height==4 |
| |
| |
| uaddl v14.8h, v6.8b, v7.8b // temp1 = src[2_0] + src[3_0] |
| uaddl v16.8h, v0.8b, v5.8b // temp2 = src[1_0] + src[4_0] |
| uaddl v18.8h, v1.8b, v4.8b // temp = src[0_0] + src[5_0] |
| mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20 |
| ld1 {v2.s}[0], [x0], x2 |
| mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5 |
| uaddl v8.8h, v0.8b, v7.8b |
| uaddl v10.8h, v1.8b, v6.8b |
| uaddl v12.8h, v2.8b, v5.8b |
| sqrshrun v26.8b, v18.8h, #5 |
| mla v12.8h, v8.8h , v22.8h |
| ld1 {v3.s}[0], [x0], x2 |
| mls v12.8h, v10.8h , v24.8h |
| st1 {v26.s}[0], [x1], x3 |
| sqrshrun v27.8b, v12.8h, #5 |
| st1 {v27.s}[0], [x1], x3 |
| uaddl v14.8h, v0.8b, v1.8b // temp1 = src[2_0] + src[3_0] |
| uaddl v16.8h, v2.8b, v7.8b // temp2 = src[1_0] + src[4_0] |
| uaddl v18.8h, v3.8b, v6.8b // temp = src[0_0] + src[5_0] |
| mla v18.8h, v14.8h , v22.8h // temp += temp1 * 20 |
| ld1 {v4.s}[0], [x0], x2 |
| mls v18.8h, v16.8h , v24.8h // temp -= temp2 * 5 |
| uaddl v8.8h, v2.8b, v1.8b |
| uaddl v10.8h, v3.8b, v0.8b |
| uaddl v12.8h, v4.8b, v7.8b |
| sqrshrun v26.8b, v18.8h, #5 |
| mla v12.8h, v8.8h , v22.8h |
| ld1 {v5.s}[0], [x0], x2 |
| mls v12.8h, v10.8h , v24.8h |
| st1 {v26.s}[0], [x1], x3 |
| sqrshrun v27.8b, v12.8h, #5 |
| st1 {v27.s}[0], [x1], x3 |
| |
| |
| end_func: |
| // Common exit: undo the prologue in reverse order (x19/x20 first, then the pop_v_regs macro). |
| ldp x19, x20, [sp], #16 |
| pop_v_regs |
| ret |
| |
| |
| |