| ///***************************************************************************** |
| //* |
| //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //*****************************************************************************/ |
| ///** |
| //******************************************************************************* |
| //* //file |
| //* ihevc_inter_pred_chroma_copy_w16out_neon.s |
| //* |
| //* //brief |
| //* contains function definitions for inter prediction interpolation. |
| //* functions are hand-coded in armv8 (aarch64) neon assembly, in gnu |
| //* assembler syntax |
| //* |
| //* //author |
| //* yogeswaran rs |
| //* |
| //* //par list of functions: |
| //* |
| //* |
| //* //remarks |
| //* none |
| //* |
| //******************************************************************************* |
| //*/ |
| ///** |
| //******************************************************************************* |
| //* |
| //* //brief |
| //* chroma interprediction filter for copy |
| //* |
| //* //par description: |
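| //* converts the array of height 'ht' and 2*'wd' samples per row from the |
| //* location pointed by 'src' to 16-bit samples at the location pointed by |
| //* 'dst'; each 8-bit input sample is zero-extended and shifted left by 6 |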
| //* |
| //* //param[in] pu1_src |
| //* uword8 pointer to the source |
| //* |
| //* //param[out] pi2_dst |
| //* word16 pointer to the destination |
| //* |
| //* //param[in] src_strd |
| //* integer source stride |
| //* |
| //* //param[in] dst_strd |
| //* integer destination stride, in 16-bit elements |
| //* |
| //* //param[in] pi1_coeff |
| //* word8 pointer to the filter coefficients (not read by this function) |
| //* |
| //* //param[in] ht |
| //* integer height of the array |
| //* |
| //* //param[in] wd |
| //* integer width of the array |
| //* |
| //* //returns |
| //* |
| //* //remarks |
| //* none |
| //* |
| //******************************************************************************* |
| //*/ |
| |
| //void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src, |
| // word16 *pi2_dst, |
| // word32 src_strd, |
| // word32 dst_strd, |
| // word8 *pi1_coeff, |
| // word32 ht, |
| // word32 wd) |
| //**************variables vs registers***************************************** |
| //x0 => *pu1_src |
| //x1 => *pi2_dst |
| //x2 => src_strd |
| //x3 => dst_strd |
| //x4 => *pi1_coeff |
| //x5 => ht |
| //x6 => wd |
| |
| .text |
| .align 4 |
| |
| .include "ihevc_neon_macros.s" |
| |
| .globl ihevc_inter_pred_chroma_copy_w16out_av8 |
| |
| .type ihevc_inter_pred_chroma_copy_w16out_av8, %function |
| |
| //----------------------------------------------------------------------------- |
| // ihevc_inter_pred_chroma_copy_w16out_av8 |
| // |
| // Widens 8-bit source samples to 16 bits, shifts them left by 6 and stores |
| // them to the 16-bit destination. Each row holds 2*wd samples. |
| // |
| // In: x0 = pu1_src, x1 = pi2_dst, w2 = src_strd, w3 = dst_strd, |
| // x4 = pi1_coeff (not read), w5 = ht, w6 = wd |
| // Out: none |
| // Clobbers: x0-x12, v0-v7, v16, v18, v20, v22, v24, v26, flags |
| // (x19/x20 are saved and restored on the stack) |
| // |
| // Paths: |
| // - 4 samples per step (loop_ht_6): taken when (ht & 6) == 6 or when 2*wd |
| // is not a multiple of 8; leftover rows (ht % 4) go through the |
| // two-row loop. |
| // - 8 samples per step (core_loop_wd_8): software pipelined, 4 rows at a |
| // time; core_loop_wd_8_ht_2 is used when ht < 4. |
| // |
| // NOTE(review): on the 8-wide path leftover rows (ht % 4) are not processed |
| // once at least 4 rows exist, and both two-row loops always write two rows |
| // even when ht == 1 - confirm callers never pass such heights. |
| //----------------------------------------------------------------------------- |
| ihevc_inter_pred_chroma_copy_w16out_av8: |
| |
| stp x19, x20,[sp,#-16]! //x20 is used as scratch below |
| |
| //src_strd, dst_strd, ht and wd are word32 arguments: only the low 32 bits |
| //of their registers are defined on entry, so sign-extend them before they |
| //take part in 64-bit address arithmetic and loop counts |
| sxtw x2,w2 //src_strd |
| sxtw x3,w3 //dst_strd |
| sxtw x7,w5 //ht |
| sxtw x12,w6 //wd |
| |
| lsl x12,x12,#1 //2*wd |
| cmp x7,#0 //ht condition(ht <= 0) |
| ble end_loops //nothing to do |
| and x8,x7,#3 //x8 = ht % 4 (leftover rows) |
| sub x9,x7,x8 //x9 = ht rounded down to a multiple of 4 |
| and x11,x7,#6 |
| cmp x11,#6 |
| beq loop_ht_6 //(ht & 6) == 6 always uses the 4-wide path |
| tst x12,#7 //is 2*wd a multiple of 8? |
| beq core_loop_wd_8 |
| |
| loop_ht_6: |
| sub x11,x12,#4 //x11 = 2*wd - 4, used to step back to the row start |
| lsl x6, x3,#1 //x6 = dst_strd in bytes (16-bit elements) |
| cmp x9,#0 |
| beq outer_loop_wd_4_ht_2 //fewer than 4 rows |
| |
| outer_loop_wd_4: |
| subs x4,x12,#0 //wd conditional subtract |
| ble end_inner_loop_wd_4 |
| |
| inner_loop_wd_4: |
| ld1 {v0.8b},[x0] //vld1_u8(pu1_src_tmp) |
| add x5,x0,x2 //pu1_src +src_strd |
| uxtl v0.8h, v0.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| add x10,x1,x6 //pi2_dst + dst_strd |
| subs x4,x4,#4 //wd - 4 (flags consumed by the bgt at the loop end) |
| shl v0.2d, v0.2d,#6 //vshlq_n_s64(temp, 6) |
| ld1 {v22.8b},[x5],x2 //vld1_u8(pu1_src_tmp) |
| add x0,x0,#4 //pu1_src += 4 |
| st1 {v0.1d},[x1] //vst1q_lane_s64(pi2_dst_tmp, temp, 0) |
| add x1,x1,#8 //pi2_dst += 4 elements |
| uxtl v22.8h, v22.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| ld1 {v24.8b},[x5],x2 //vld1_u8(pu1_src_tmp) |
| shl v22.2d, v22.2d,#6 //vshlq_n_s64(temp, 6) |
| uxtl v24.8h, v24.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| st1 {v22.1d},[x10],x6 //vst1q_lane_s64(pi2_dst_tmp, temp, 0) |
| shl v24.2d, v24.2d,#6 //vshlq_n_s64(temp, 6) |
| ld1 {v26.8b},[x5],x2 //vld1_u8(pu1_src_tmp) |
| st1 {v24.1d},[x10],x6 //vst1q_lane_s64(pi2_dst_tmp, temp, 0) |
| uxtl v26.8h, v26.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| shl v26.2d, v26.2d,#6 //vshlq_n_s64(temp, 6) |
| st1 {v26.1d},[x10],x6 //vst1q_lane_s64(pi2_dst_tmp, temp, 0) |
| bgt inner_loop_wd_4 |
| |
| end_inner_loop_wd_4: |
| subs x9,x9,#4 //ht - 4 |
| sub x0,x5,x11 //src: start of the next group of 4 rows |
| sub x1,x10,x11,lsl #1 //dst: start of the next group of 4 rows |
| bgt outer_loop_wd_4 |
| cmp x8,#0 //any leftover rows? |
| bgt outer_loop_wd_4_ht_2 |
| |
| |
| end_loops: |
| ldp x19, x20,[sp],#16 |
| |
| ret |
| |
| |
| outer_loop_wd_4_ht_2: |
| subs x4,x12,#0 //wd conditional subtract |
| ble end_inner_loop_wd_4 |
| |
| //two rows per step; only the rows that are stored are loaded |
| inner_loop_wd_4_ht_2: |
| ld1 {v0.8b},[x0] //vld1_u8(pu1_src_tmp) |
| add x5,x0,x2 //pu1_src +src_strd |
| uxtl v0.8h, v0.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| add x10,x1,x6 //pi2_dst + dst_strd |
| subs x4,x4,#4 //wd - 4 (flags consumed by the bgt at the loop end) |
| shl v0.2d, v0.2d,#6 //vshlq_n_s64(temp, 6) |
| ld1 {v22.8b},[x5],x2 //vld1_u8(pu1_src_tmp) |
| add x0,x0,#4 //pu1_src += 4 |
| st1 {v0.1d},[x1] //vst1q_lane_s64(pi2_dst_tmp, temp, 0) |
| add x1,x1,#8 //pi2_dst += 4 elements |
| uxtl v22.8h, v22.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| shl v22.2d, v22.2d,#6 //vshlq_n_s64(temp, 6) |
| st1 {v22.1d},[x10],x6 //vst1q_lane_s64(pi2_dst_tmp, temp, 0) |
| bgt inner_loop_wd_4_ht_2 |
| b end_loops |
| |
| |
| core_loop_wd_8: |
| lsl x5, x3,#1 //x5 = dst_strd in bytes (16-bit elements) |
| sub x20,x12,x3, lsl #2 |
| neg x11, x20 //x11 = (dst_strd * 4) - 2*wd, in elements |
| sub x20,x12,x2,lsl #2 //x2->src_strd |
| neg x8, x20 //x8 = (src_strd * 4) - 2*wd, in bytes |
| lsr x4, x12, #3 // divide by 8 |
| mov x7,x9 |
| mul x7, x7, x4 //x7 = rows * (8-sample columns per row) |
| sub x4,x12,#0 //wd conditional check |
| sub x7,x7,#4 //subtract one for epilog |
| cmp x9,#0 |
| beq core_loop_wd_8_ht_2 //fewer than 4 rows |
| |
| //the loads for the next 4x8 block are issued before the remaining count in |
| //x7 is checked, so the block after the last one is read but never stored |
| prolog: |
| add x6,x0,x2 //pu1_src_tmp += src_strd |
| add x10,x1,x5 |
| ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) |
| ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) |
| ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) |
| ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) |
| uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| subs x4,x4,#8 //wd decrements by 8 (flags drive the csel's below) |
| shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) |
| shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) |
| shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6) |
| shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6) |
| add x20,x0,x8 |
| csel x0, x20, x0,le //row finished: src moves down 4 rows |
| add x6,x0,x2 //pu1_src_tmp += src_strd |
| ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) |
| ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) |
| ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) |
| ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) |
| |
| st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp) |
| add x20,x1,x11,lsl #1 |
| csel x1, x20, x1,le //row finished: dst moves down 4 rows |
| sub x20,x12,#0 //wd conditional check |
| csel x4, x20, x4,le //row finished: reload the column counter |
| |
| subs x7,x7,#4 //ht - 4 |
| |
| blt epilog_end //jumps to epilog_end |
| beq epilog //jumps to epilog |
| |
| |
| |
| outer_loop_wd_8: |
| |
| st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) |
| uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| |
| st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) |
| uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| |
| st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) |
| uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| |
| uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| |
| subs x4,x4,#8 //wd decrements by 8 (flags drive the csel's below) |
| add x20,x0,x8 |
| csel x0, x20, x0,le //row finished: src moves down 4 rows |
| |
| add x6,x0,x2 //pu1_src_tmp += src_strd |
| |
| ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) |
| shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) |
| |
| ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) |
| shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) |
| |
| ld1 {v5.8b},[x6],x2 //vld1_u8(pu1_src_tmp) |
| shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6) |
| |
| ld1 {v7.8b},[x6],x2 //vld1_u8(pu1_src_tmp) |
| add x10,x1,x5 |
| |
| shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6) |
| |
| st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp) |
| |
| add x20,x1,x11,lsl #1 |
| csel x1, x20, x1,le //row finished: dst moves down 4 rows |
| sub x20,x12,#0 //wd conditional check |
| csel x4, x20, x4,le //row finished: reload the column counter |
| |
| subs x7,x7,#4 //ht - 4 |
| bgt outer_loop_wd_8 |
| |
| epilog: |
| st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) |
| uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| |
| st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) |
| uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| |
| st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) |
| uxtl v20.8h, v5.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| |
| uxtl v22.8h, v7.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| |
| shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) |
| shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) |
| shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6) |
| add x10,x1,x5 |
| shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6) |
| |
| st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp) |
| epilog_end: |
| st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) |
| st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) |
| st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) |
| b end_loops |
| |
| //two rows, 8 samples per step; x12 (2*wd) doubles as the column counter |
| core_loop_wd_8_ht_2: |
| add x6,x0,x2 //pu1_src_tmp += src_strd |
| add x10,x1,x5 |
| ld1 {v1.8b},[x0],#8 //vld1_u8(pu1_src_tmp) |
| ld1 {v3.8b},[x6],x2 //vld1_u8(pu1_src_tmp) |
| uxtl v16.8h, v1.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| uxtl v18.8h, v3.8b //vmovl_u8(vld1_u8(pu1_src_tmp)) |
| subs x12,x12,#8 //wd decrements by 8 |
| shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6) |
| shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6) |
| st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp) |
| st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp) |
| bgt core_loop_wd_8_ht_2 |
| |
| ldp x19, x20,[sp],#16 |
| |
| ret |
| |
| |
| |
| |
| |
| |