| ///***************************************************************************** |
| //* |
| //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //*****************************************************************************/ |
| ///** |
| //******************************************************************************* |
| //* //file |
| //* ihevc_inter_pred_chroma_vert_w16out_neon.s |
| //* |
| //* //brief |
| //* contains function definitions for inter prediction interpolation. |
| //* functions are coded using neon intrinsics and can be compiled using |
| |
| //* rvct |
| //* |
| //* //author |
| //* yogeswaran rs/ pathiban |
| //* |
| //* //par list of functions: |
| //* |
| //* |
| //* //remarks |
| //* none |
| //* |
| //******************************************************************************* |
| //*/ |
| ///** |
| ///** |
| //******************************************************************************* |
| //* |
| //* //brief |
| //* interprediction chroma filter to store vertical 16bit ouput |
| //* |
| //* //par description: |
| //* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to |
| //* the elements pointed by 'pu1_src' and writes to the location pointed by |
| //* 'pu1_dst' no downshifting or clipping is done and the output is used as |
| //* an input for weighted prediction assumptions : the function is optimized |
| //* considering the fact width is multiple of 2,4 or 8. and also considering |
| //* height should be multiple of 2. width 4,8 is optimized further |
| //* |
| //* //param[in] pu1_src |
| //* uword8 pointer to the source |
| //* |
| //* //param[out] pi2_dst |
| //* word16 pointer to the destination |
| //* |
| //* //param[in] src_strd |
| //* integer source stride |
| //* |
| //* //param[in] dst_strd |
| //* integer destination stride |
| //* |
| //* //param[in] pi1_coeff |
| //* word8 pointer to the filter coefficients |
| //* |
| //* //param[in] ht |
| //* integer height of the array |
| //* |
| //* //param[in] wd |
| //* integer width of the array |
| //* |
| //* //returns |
| //* |
| //* //remarks |
| //* none |
| //* |
| //***************************************************************************** |
| //*/ |
| //void ihevc_inter_pred_chroma_vert_w16out(uword8 *pu1_src, |
| // word16 *pi2_dst, |
| // word32 src_strd, |
| // word32 dst_strd, |
| // word8 *pi1_coeff, |
| // word32 ht, |
| // word32 wd) |
| //**************variables vs registers***************************************** |
| //x0 => *pu1_src |
| //x1 => *pi2_dst |
| //x2 => src_strd |
| //x3 => dst_strd |
| |
| .text |
| .align 4 |
| |
| .include "ihevc_neon_macros.s" |
| |
| .globl ihevc_inter_pred_chroma_vert_w16out_av8 |
| |
| .type ihevc_inter_pred_chroma_vert_w16out_av8, %function |
| |
| ihevc_inter_pred_chroma_vert_w16out_av8: |
| |
| // stmfd sp!,{x4-x12,x14} //stack stores the values of the arguments |
| |
| stp x19, x20,[sp,#-16]! |
| |
| mov x15,x4 // pi1_coeff |
| mov x16,x5 // ht |
| mov x17,x6 // wd |
| |
| |
| mov x4,x16 //loads ht |
| mov x12,x15 //loads pi1_coeff |
| cmp x4,#0 //checks ht == 0 |
| mov x6,x17 //loads wd |
| sub x0,x0,x2 //pu1_src - src_strd |
| ld1 {v0.8b},[x12] //loads pi1_coeff |
| |
| ble end_loops //jumps to end |
| |
| tst x6,#3 //checks (wd & 3) |
| abs v3.8b, v0.8b //vabs_s8(coeff) |
| lsl x10,x6,#1 //2*wd |
| dup v0.8b, v3.b[0] //coeffabs_0 |
| dup v1.8b, v3.b[1] //coeffabs_1 |
| dup v2.8b, v3.b[2] //coeffabs_2 |
| dup v3.8b, v3.b[3] //coeffabs_3 |
| |
| bgt outer_loop_wd_2 //jumps to loop handling wd ==2 |
| |
| tst x4,#7 //checks ht for mul of 8 |
| beq core_loop_ht_8 //when height is multiple of 8 |
| |
| lsl x7,x3,#2 //2*dst_strd |
| sub x9,x7,x10,lsl #1 //4*dst_strd - 4wd |
| lsl x12,x2,#1 //2*src_strd |
| sub x8,x12,x10 //2*src_strd - 2wd |
| lsl x3, x3, #1 |
| mov x5,x10 //2wd |
| |
| inner_loop_ht_2: //called when wd is multiple of 4 and ht is 4,2 |
| |
| add x6,x0,x2 //pu1_src +src_strd |
| ld1 {v17.8b},[x6],x2 //loads pu1_src |
| subs x5,x5,#8 //2wd - 8 |
| ld1 {v5.8b},[x0],#8 //loads src |
| umull v6.8h, v17.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1) |
| ld1 {v4.8b},[x6],x2 //loads incremented src |
| umlsl v6.8h, v5.8b, v0.8b //vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0) |
| ld1 {v16.8b},[x6],x2 //loads incremented src |
| umlal v6.8h, v4.8b, v2.8b //vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2) |
| umull v4.8h, v4.8b, v1.8b |
| ld1 {v18.8b},[x6] //loads the incremented src |
| umlsl v6.8h, v16.8b, v3.8b |
| umlsl v4.8h, v17.8b, v0.8b |
| umlal v4.8h, v16.8b, v2.8b |
| umlsl v4.8h, v18.8b, v3.8b |
| add x6,x1,x3 //pu1_dst + dst_strd |
| st1 { v6.8h},[x1],#16 //stores the loaded value |
| |
| st1 { v4.8h},[x6] //stores the loaded value |
| |
| bgt inner_loop_ht_2 //inner loop again |
| |
| subs x4,x4,#2 //ht - 2 |
| add x1,x1,x9 //pu1_dst += (2*dst_strd - 2wd) |
| mov x5,x10 //2wd |
| add x0,x0,x8 //pu1_src += (2*src_strd - 2wd) |
| |
| bgt inner_loop_ht_2 //loop again |
| |
| b end_loops //jumps to end |
| |
| outer_loop_wd_2: //called when width is multiple of 2 |
| lsl x5,x3,#2 //2*dst_strd |
| mov x12,x10 //2wd |
| sub x9,x5,x10,lsl #1 //4*dst_strd - 4wd |
| lsl x7,x2,#1 //2*src_strd |
| sub x8,x7,x10 //2*src_strd - 2wd |
| |
| inner_loop_wd_2: |
| |
| add x6,x0,x2 //pu1_src + src_strd |
| ld1 {v6.s}[0],[x0] //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0 |
| subs x12,x12,#4 //2wd - 4 |
| add x0,x0,#4 //pu1_src + 4 |
| ld1 {v6.s}[1],[x6],x2 //loads pu1_src_tmp |
| dup v7.2s, v6.s[1] |
| ld1 {v7.s}[1],[x6],x2 //loads pu1_src_tmp |
| umull v4.8h, v7.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1) |
| dup v7.2s, v7.s[1] |
| ld1 {v7.s}[1],[x6],x2 |
| umlsl v4.8h, v6.8b, v0.8b |
| umlal v4.8h, v7.8b, v2.8b |
| dup v7.2s, v7.s[1] |
| ld1 {v7.s}[1],[x6] |
| add x6,x1,x3,lsl #1 //pu1_dst + dst_strd |
| umlsl v4.8h, v7.8b, v3.8b |
| st1 {v4.d}[0],[x1] //stores the loaded value |
| add x1,x1,#8 //pu1_dst += 4 |
| st1 {v4.d}[1],[x6] //stores the loaded value |
| |
| bgt inner_loop_wd_2 //inner loop again |
| |
| //inner loop ends |
| subs x4,x4,#2 //ht - 2 |
| add x1,x1,x9 //pu1_dst += 2*dst_strd - 2*wd |
| mov x12,x10 //2wd |
| add x0,x0,x8 //pu1_src += 2*src_strd - 2*wd |
| |
| bgt inner_loop_wd_2 //loop again |
| |
| b end_loops //jumps to end |
| |
| core_loop_ht_8: //when wd & ht is multiple of 8 |
| |
| lsl x12,x3,#3 //4*dst_strd |
| sub x8,x12,x10,lsl #1 //4*dst_strd - 2wd |
| lsl x12,x2,#2 //4*src_strd |
| sub x9,x12,x10 //4*src_strd - 2wd |
| |
| bic x5,x10,#7 //x5 ->wd |
| lsr x14, x10, #3 //divide by 8 |
| mul x12, x4 , x14 //multiply height by width |
| sub x12, x12,#4 //subtract by one for epilog |
| lsl x3, x3, #1 |
| |
| prolog: |
| add x6,x0,x2 //pu1_src + src_strd |
| ld1 {v5.8b},[x6],x2 //loads pu1_src |
| subs x5,x5,#8 //2wd - 8 |
| ld1 {v4.8b},[x0],#8 //loads the source |
| ld1 {v6.8b},[x6],x2 //load and increment |
| umull v30.8h, v5.8b, v1.8b //mul with coeff 1 |
| ld1 {v7.8b},[x6],x2 //load and increment |
| umlsl v30.8h, v4.8b, v0.8b |
| add x7,x1,x3 //pu1_dst |
| umlal v30.8h, v6.8b, v2.8b |
| umlsl v30.8h, v7.8b, v3.8b |
| ld1 {v16.8b},[x6],x2 //load and increment |
| |
| umull v28.8h, v6.8b, v1.8b //mul_res 2 |
| add x20,x0,x9 //pu1_dst += 4*dst_strd - 2*wd |
| csel x0, x20, x0,le |
| umlsl v28.8h, v5.8b, v0.8b |
| bic x20,x10,#7 //x5 ->wd |
| csel x5, x20, x5,le |
| umlal v28.8h, v7.8b, v2.8b |
| ld1 {v17.8b},[x6],x2 |
| umlsl v28.8h, v16.8b, v3.8b |
| |
| ld1 {v18.8b},[x6],x2 |
| umull v26.8h, v7.8b, v1.8b |
| add x6,x0,x2 //pu1_src + src_strd |
| umlsl v26.8h, v6.8b, v0.8b |
| st1 { v30.16b},[x1],#16 //stores the loaded value |
| umlal v26.8h, v16.8b, v2.8b |
| ld1 {v4.8b},[x0],#8 //loads the source |
| umlsl v26.8h, v17.8b, v3.8b |
| |
| add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd |
| csel x1, x20, x1,le |
| umull v24.8h, v16.8b, v1.8b |
| ld1 {v5.8b},[x6],x2 //loads pu1_src |
| umlsl v24.8h, v7.8b, v0.8b |
| subs x12,x12,#4 |
| ld1 {v6.8b},[x6],x2 //load and increment |
| umlal v24.8h, v17.8b, v2.8b |
| ld1 {v7.8b},[x6],x2 //load and increment |
| umlsl v24.8h, v18.8b, v3.8b |
| sub x20,x2,x2,lsl #3 |
| neg x11, x20 |
| add x14,x2,x2,lsl #1 |
| add x14,x14,x11 |
| st1 { v28.16b},[x7],x3 //stores the loaded value |
| |
| ble epilog //jumps to epilog |
| |
| kernel_8: |
| |
| umull v30.8h, v5.8b, v1.8b //mul with coeff 1 |
| subs x5,x5,#8 //2wd - 8 |
| umlsl v30.8h, v4.8b, v0.8b |
| add x20,x0,x9 //pu1_dst += 4*dst_strd - 2*wd |
| csel x0, x20, x0,le |
| umlal v30.8h, v6.8b, v2.8b |
| |
| lsl x20,x2,#3 |
| sub x20,x20,x2 |
| csel x11,x20,x11,le |
| //rsble x11,x2,x2,lsl #3 |
| umlsl v30.8h, v7.8b, v3.8b |
| st1 { v26.16b},[x7],x3 //stores the loaded value |
| |
| ld1 {v16.8b},[x6],x2 //load and increment |
| |
| umull v28.8h, v6.8b, v1.8b //mul_res 2 |
| bic x20,x10,#7 //x5 ->wd |
| csel x5, x20, x5,le |
| umlsl v28.8h, v5.8b, v0.8b |
| st1 { v24.16b},[x7],x3 //stores the loaded value |
| |
| umlal v28.8h, v7.8b, v2.8b |
| ld1 {v17.8b},[x6],x2 |
| |
| umlsl v28.8h, v16.8b, v3.8b |
| ld1 {v18.8b},[x6],x2 |
| add x7,x1,x3 //pu1_dst |
| umull v26.8h, v7.8b, v1.8b |
| add x6,x0,x2 //pu1_src + src_strd |
| add x20,x0, x11 |
| prfm PLDL1KEEP,[x20] |
| |
| umlsl v26.8h, v6.8b, v0.8b |
| ld1 {v4.8b},[x0],#8 //loads the source |
| |
| add x11,x11,x2 |
| umlal v26.8h, v16.8b, v2.8b |
| st1 { v30.16b},[x1],#16 //stores the loaded value |
| |
| umlsl v26.8h, v17.8b, v3.8b |
| ld1 {v5.8b},[x6],x2 //loads pu1_src |
| |
| umull v24.8h, v16.8b, v1.8b |
| ld1 {v6.8b},[x6],x2 //load and increment |
| add x20,x1,x8 //pu1_src += 4*src_strd - 2*wd |
| csel x1, x20, x1,le |
| |
| cmp x11,x14 |
| |
| lsl x20,x2,#3 |
| sub x20,x20,x2 |
| csel x11,x20,x11,gt |
| //rsbgt x11,x2,x2,lsl #3 |
| |
| umlsl v24.8h, v7.8b, v0.8b |
| subs x12,x12,#4 |
| |
| |
| umlal v24.8h, v17.8b, v2.8b |
| ld1 {v7.8b},[x6],x2 //load and increment |
| |
| umlsl v24.8h, v18.8b, v3.8b |
| st1 { v28.16b},[x7],x3 //stores the loaded value |
| |
| bgt kernel_8 //jumps to kernel_8 |
| |
| epilog: |
| |
| umull v30.8h, v5.8b, v1.8b //mul with coeff 1 |
| umlsl v30.8h, v4.8b, v0.8b |
| umlal v30.8h, v6.8b, v2.8b |
| umlsl v30.8h, v7.8b, v3.8b |
| st1 { v26.16b},[x7],x3 //stores the loaded value |
| |
| ld1 {v16.8b},[x6],x2 //load and increment |
| umull v28.8h, v6.8b, v1.8b //mul_res 2 |
| umlsl v28.8h, v5.8b, v0.8b |
| umlal v28.8h, v7.8b, v2.8b |
| umlsl v28.8h, v16.8b, v3.8b |
| st1 { v24.16b},[x7],x3 //stores the loaded value |
| |
| ld1 {v17.8b},[x6],x2 |
| umull v26.8h, v7.8b, v1.8b |
| add x7,x1,x3 //pu1_dst |
| umlsl v26.8h, v6.8b, v0.8b |
| st1 { v30.16b},[x1],#16 //stores the loaded value |
| umlal v26.8h, v16.8b, v2.8b |
| ld1 {v18.8b},[x6],x2 |
| umlsl v26.8h, v17.8b, v3.8b |
| |
| umull v24.8h, v16.8b, v1.8b |
| st1 { v28.16b},[x7],x3 //stores the loaded value |
| umlsl v24.8h, v7.8b, v0.8b |
| umlal v24.8h, v17.8b, v2.8b |
| st1 { v26.16b},[x7],x3 //stores the loaded value |
| umlsl v24.8h, v18.8b, v3.8b |
| |
| st1 { v24.16b},[x7],x3 //stores the loaded value |
| |
| end_loops: |
| // ldmfd sp!,{x4-x12,x15} //reload the registers from sp |
| ldp x19, x20,[sp],#16 |
| |
| ret |
| |
| |
| |