// blob: dccbb2ba8afdb8e07adf6628d8fb3700e660b13b [file] [log] [blame]
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//*
//* @brief
//*     inter prediction luma function for copy
//*
//* @par description:
//*     copies the array of width 'wd' and height 'ht' from the location pointed
//*     to by 'src' to the location pointed to by 'dst'
//*
//* @param[in] pu1_src
//*     uword8 pointer to the source
//*
//* @param[out] pu1_dst
//*     uword8 pointer to the destination
//*
//* @param[in] src_strd
//*     integer source stride
//*
//* @param[in] dst_strd
//*     integer destination stride
//*
//* @param[in] pi1_coeff
//*     word8 pointer to the filter coefficients (not read by this copy routine)
//*
//* @param[in] ht
//*     integer height of the array
//*
//* @param[in] wd
//*     integer width of the array
//*
//* @returns
//*     none
//*
//* @remarks
//*     none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_luma_copy (
// uword8 *pu1_src,
// uword8 *pu1_dst,
// word32 src_strd,
// word32 dst_strd,
// word8 *pi1_coeff,
// word32 ht,
// word32 wd )
//**************variables vs registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
// x2 => src_strd
// x3 => dst_strd
// x11 => ht
// x16 => wd
.text                                           // emit into the executable code section
.p2align 4                                      // 2^4 = 16-byte alignment for the entry point
.include "ihevc_neon_macros.s"                  // shared macro file (contents not visible here)
.global ihevc_inter_pred_luma_copy_av8          // export the symbol so C callers can link to it
.type   ihevc_inter_pred_luma_copy_av8, %function
//------------------------------------------------------------------------------
// void ihevc_inter_pred_luma_copy_av8(uword8 *pu1_src, uword8 *pu1_dst,
//                                     word32 src_strd, word32 dst_strd,
//                                     word8 *pi1_coeff, word32 ht, word32 wd)
//
// ABI:   AAPCS64, leaf function, no stack usage
// In:    x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
//        x4 = pi1_coeff (never read), w5 = ht, w6 = wd
// Out:   none
// Clobb: x0-x3, x8-x11, x15, x16, v0-v3, flags
//
// Copies a wd x ht byte block from src to dst. Every outer iteration copies
// four rows and every inner iteration copies one 4-, 8- or 16-byte wide
// column strip of those rows, so the routine always moves whole 4-row by
// 4/8/16-byte tiles.
// NOTE(review): callers presumably pass ht and wd as multiples of 4 - confirm.
//
// Register roles inside the loops:
//   x8  = bytes of the current row group still to copy
//   x9  = src pointer walking down rows 1..3 of the current strip
//   x10 = dst pointer walking down rows 1..3 of the current strip
//   x11 = rows still to copy (ht)
//   x15 = wd - strip width, used to rewind to the start of the next row group
//   x16 = wd
//------------------------------------------------------------------------------
ihevc_inter_pred_luma_copy_av8:
    // The four integer arguments are 32-bit in the C prototype; AAPCS64 leaves
    // the upper halves of their registers unspecified, so widen them before
    // they are used in 64-bit compares and address arithmetic.
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    sxtw        x11,w5                      //ht
    sxtw        x16,w6                      //wd

    cmp         x11,#0                      //nothing to do when ht <= 0
    ble         .Lluma_copy_done
    cmp         x16,#0                      //nothing to do when wd <= 0
    ble         .Lluma_copy_done

    tst         x16,#15                     //wd a multiple of 16 -> 16-byte strips
    beq         .Lluma_copy_wd_16
    tst         x16,#7                      //wd a multiple of 8  -> 8-byte strips
    beq         .Lluma_copy_wd_8

    //-------------------------- 4-byte wide strips ----------------------------
    sub         x15,x16,#4                  //rewind distance for the row-group step
.Lluma_copy_rows_wd_4:
    mov         x8,x16                      //bytes left in this row group (> 0 here)
.Lluma_copy_cols_wd_4:
    add         x9,x0,x2                    //src row 1 of this strip
    ld1         {v0.s}[0],[x0],#4           //row 0: load 4 bytes, pu1_src += 4
    add         x10,x1,x3                   //dst row 1 of this strip
    st1         {v0.s}[0],[x1],#4           //row 0: store 4 bytes, pu1_dst += 4
    ld1         {v1.s}[0],[x9],x2           //row 1
    st1         {v1.s}[0],[x10],x3
    subs        x8,x8,#4                    //flags consumed by the bgt below
    ld1         {v2.s}[0],[x9],x2           //row 2
    st1         {v2.s}[0],[x10],x3
    ld1         {v3.s}[0],[x9],x2           //row 3; x9/x10 end 4 rows below the strip
    st1         {v3.s}[0],[x10],x3
    bgt         .Lluma_copy_cols_wd_4

    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of the next row group
    sub         x1,x10,x15                  //pu1_dst = start of the next row group
    bgt         .Lluma_copy_rows_wd_4
.Lluma_copy_done:
    ret

    //-------------------------- 8-byte wide strips ----------------------------
.Lluma_copy_wd_8:
    sub         x15,x16,#8                  //rewind distance for the row-group step
.Lluma_copy_rows_wd_8:
    mov         x8,x16                      //bytes left in this row group (> 0 here)
.Lluma_copy_cols_wd_8:
    add         x9,x0,x2                    //src row 1 of this strip
    ld1         {v0.8b},[x0],#8             //row 0: load 8 bytes, pu1_src += 8
    add         x10,x1,x3                   //dst row 1 of this strip
    st1         {v0.8b},[x1],#8             //row 0: store 8 bytes, pu1_dst += 8
    ld1         {v1.8b},[x9],x2             //row 1
    st1         {v1.8b},[x10],x3
    subs        x8,x8,#8                    //flags consumed by the bgt below
    ld1         {v2.8b},[x9],x2             //row 2
    st1         {v2.8b},[x10],x3
    ld1         {v3.8b},[x9],x2             //row 3
    st1         {v3.8b},[x10],x3
    bgt         .Lluma_copy_cols_wd_8

    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of the next row group
    sub         x1,x10,x15                  //pu1_dst = start of the next row group
    bgt         .Lluma_copy_rows_wd_8
    ret

    //------------------------- 16-byte wide strips ----------------------------
.Lluma_copy_wd_16:
    sub         x15,x16,#16                 //rewind distance for the row-group step
.Lluma_copy_rows_wd_16:
    mov         x8,x16                      //bytes left in this row group (> 0 here)
.Lluma_copy_cols_wd_16:
    add         x9,x0,x2                    //src row 1 of this strip
    ld1         {v0.16b},[x0],#16           //row 0: load 16 bytes, pu1_src += 16
    add         x10,x1,x3                   //dst row 1 of this strip
    st1         {v0.16b},[x1],#16           //row 0: store 16 bytes, pu1_dst += 16
    ld1         {v1.16b},[x9],x2            //row 1
    st1         {v1.16b},[x10],x3
    subs        x8,x8,#16                   //flags consumed by the bgt below
    ld1         {v2.16b},[x9],x2            //row 2
    st1         {v2.16b},[x10],x3
    ld1         {v3.16b},[x9],x2            //row 3
    st1         {v3.16b},[x10],x3
    bgt         .Lluma_copy_cols_wd_16

    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of the next row group
    sub         x1,x10,x15                  //pu1_dst = start of the next row group
    bgt         .Lluma_copy_rows_wd_16
    ret

    .size       ihevc_inter_pred_luma_copy_av8, .-ihevc_inter_pred_luma_copy_av8