blob: 180e5f5a82d285dddd3bd045ce203c4d19a7e9f8 [file] [log] [blame]
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//* ihevc_inter_pred_chroma_copy_w16out_neon.s
//*
//* //brief
//* contains function definitions for inter prediction interpolation.
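//* functions are hand-written in armv8 (aarch64) neon assembly using gnu
//* assembler syntax; the intrinsic names in the per-line comments only
//* describe the equivalent c operation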
//*
//* //author
//* yogeswaran rs
//*
//* //par list of functions:
//*
//*
//* //remarks
//* none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//* chroma interprediction filter for copy
//*
//* //par description:
//* converts the 8-bit array of height 'ht' pointed to by 'src' into 16-bit
//* samples at 'dst': every sample is zero-extended and shifted left by 6.
//* each row holds 2 * 'wd' samples (the code doubles 'wd' before use)
//*
//* //param[in] pu1_src
//* uword8 pointer to the source
//*
//* //param[out] pi2_dst
//* word16 pointer to the destination
//*
//* //param[in] src_strd
//* integer source stride
//*
//* //param[in] dst_strd
//* integer destination stride
//*
//* //param[in] pi1_coeff
//* word8 pointer to the filter coefficients (never read by this function)
//*
//* //param[in] ht
//* integer height of the array
//*
//* //param[in] wd
//* integer width of the array
//*
//* //returns
//*
//* //remarks
//* none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
// word16 *pi2_dst,
// word32 src_strd,
// word32 dst_strd,
// word8 *pi1_coeff,
// word32 ht,
// word32 wd)
//**************variables vs registers*****************************************
//x0 => *pu1_src
//x1 => *pi2_dst
//x2 => src_strd
//x3 => dst_strd
//x4 => *pi1_coeff
//x5 => ht
//x6 => wd
// Everything that follows is emitted into the text (code) section.
.text
// NOTE(review): on ARM/AArch64 targets GNU as treats the .align operand as a
// power of two, which would make this a 16-byte alignment request - confirm
// against the assembler actually used to build this file.
.align 4
// Project-local include, presumably shared macro definitions (its contents are
// not visible here); the function in this file does not appear to invoke any
// macro.
.include "ihevc_neon_macros.s"
// Export the entry point and tag the symbol as a function.
.globl ihevc_inter_pred_chroma_copy_w16out_av8
.type ihevc_inter_pred_chroma_copy_w16out_av8, %function
//-----------------------------------------------------------------------------
// void ihevc_inter_pred_chroma_copy_w16out_av8(uword8 *pu1_src,
//                                              word16 *pi2_dst,
//                                              word32 src_strd,
//                                              word32 dst_strd,
//                                              word8  *pi1_coeff,
//                                              word32 ht,
//                                              word32 wd)
//
// Widens a block of 8-bit samples to 16 bits: every stored value is the
// zero-extended source byte shifted left by 6. A row holds 2 * wd samples.
//
// In:   x0 = pu1_src   8-bit source
//       x1 = pi2_dst   16-bit destination
//       w2 = src_strd  byte step between source rows
//       w3 = dst_strd  step between destination rows in 16-bit elements
//                      (doubled below to obtain a byte step)
//       x4 = pi1_coeff not read
//       w5 = ht        number of rows
//       w6 = wd        half the number of samples per row
// Out:  nothing is returned
// Clobbers: x0-x12, v0-v7, v16, v18, v20, v22, v24, v26, condition flags.
//           x19/x20 are saved on entry and restored on exit.
//
// The four word32 arguments are sign-extended on entry because the upper half
// of a register carrying a 32-bit argument is not defined by the procedure
// call standard. The function returns without touching memory when ht <= 0
// or wd <= 0.
//
// Two code paths exist:
//   * wide path   (core_loop_wd_8): taken when 2 * wd is a multiple of 8 and
//                 (ht & 6) != 6. Works on tiles of 8 samples x 4 rows with
//                 the loads of the next tile overlapped with the stores of
//                 the current one; ht < 4 uses a dedicated 2-row loop.
//   * narrow path (loop_ht_6): everything else. Works on 4 samples x 4 rows
//                 per step, followed by a 2-row tail when ht & 3 is non-zero.
//
// NOTE(review): the pointer rewind of the narrow path is only exact when
//   2 * wd is a multiple of 4 - presumably callers never pass an odd wd.
// NOTE(review): on the wide path with ht >= 4 the ht & 3 leftover rows are
//   not processed, and any non-zero leftover is always handled as exactly two
//   rows elsewhere - presumably callers only use heights for which this is
//   correct; confirm.
// NOTE(review): the narrow path loads 8 source bytes per row although only 4
//   are stored, and the wide path loads the following tile before it checks
//   whether one exists, so source bytes outside the block are read.
//-----------------------------------------------------------------------------
ihevc_inter_pred_chroma_copy_w16out_av8:
    stp     x19, x20, [sp, #-16]!       // x20 is scratch below; x19 is unused and only fills the pair
    sxtw    x2, w2                      // src_strd as a signed 64-bit value
    sxtw    x3, w3                      // dst_strd as a signed 64-bit value
    sxtw    x7, w5                      // x7  = ht
    sxtw    x12, w6                     // x12 = wd
    lsl     x12, x12, #1                // x12 = 2 * wd = samples per row
    cmp     x7, #0
    ble     end_loops                   // nothing to do for ht <= 0
    cmp     x12, #0
    ble     end_loops                   // nothing to do for wd <= 0
    and     x8, x7, #3                  // x8  = ht % 4 (leftover rows)
    sub     x9, x7, x8                  // x9  = ht rounded down to a multiple of 4
    and     x11, x7, #6
    cmp     x11, #6
    beq     loop_ht_6                   // bits 1 and 2 of ht both set: always narrow path
    tst     x12, #7
    beq     core_loop_wd_8              // row length is a multiple of 8: wide path

// ---------------- narrow path: 4 samples per step ----------------
loop_ht_6:
    sub     x11, x12, #4                // x11 = offset of the last 4-sample column in a row
    lsl     x6, x3, #1                  // x6  = destination row step in bytes
    cmp     x9, #0
    beq     outer_loop_wd_4_ht_2        // fewer than 4 rows: only the 2-row loop runs

outer_loop_wd_4:
    mov     x4, x12                     // x4 = samples left in this band of 4 rows

inner_loop_wd_4:
    ld1     {v0.8b}, [x0]               // row 0 (8 bytes read, low 4 used)
    add     x5, x0, x2                  // x5  -> source row 1
    uxtl    v0.8h, v0.8b                // zero-extend bytes to halfwords
    add     x10, x1, x6                 // x10 -> destination row 1
    subs    x4, x4, #4                  // flags consumed by the bgt at the loop bottom
    // A 64-bit-lane shift is enough here: each halfword holds at most 255,
    // so shifting by 6 cannot carry into the neighbouring halfword.
    shl     v0.2d, v0.2d, #6
    ld1     {v22.8b}, [x5], x2          // row 1, x5 -> row 2
    add     x0, x0, #4                  // next source column
    st1     {v0.1d}, [x1]               // store 4 halfwords of row 0
    add     x1, x1, #8                  // next destination column
    uxtl    v22.8h, v22.8b
    ld1     {v24.8b}, [x5], x2          // row 2, x5 -> row 3
    shl     v22.2d, v22.2d, #6
    uxtl    v24.8h, v24.8b
    st1     {v22.1d}, [x10], x6         // row 1, x10 -> row 2
    shl     v24.2d, v24.2d, #6
    ld1     {v26.8b}, [x5], x2          // row 3, x5 -> row 4
    st1     {v24.1d}, [x10], x6         // row 2, x10 -> row 3
    uxtl    v26.8h, v26.8b
    shl     v26.2d, v26.2d, #6
    st1     {v26.1d}, [x10], x6         // row 3, x10 -> row 4
    bgt     inner_loop_wd_4

end_inner_loop_wd_4:
    subs    x9, x9, #4                  // one band of 4 rows finished
    sub     x0, x5, x11                 // rewind source to column 0, four rows down
    sub     x1, x10, x11, lsl #1        // same for the destination (2 bytes per sample)
    bgt     outer_loop_wd_4
    cmp     x8, #0
    bgt     outer_loop_wd_4_ht_2        // leftover rows: run the 2-row tail

end_loops:
    ldp     x19, x20, [sp], #16
    ret

// Two-row variant of the narrow loop. Only rows 0 and 1 are loaded; no third
// row is touched.
outer_loop_wd_4_ht_2:
    mov     x4, x12                     // x4 = samples left in these 2 rows

inner_loop_wd_4_ht_2:
    ld1     {v0.8b}, [x0]               // row 0 (8 bytes read, low 4 used)
    add     x5, x0, x2                  // x5  -> source row 1
    uxtl    v0.8h, v0.8b
    add     x10, x1, x6                 // x10 -> destination row 1
    subs    x4, x4, #4                  // flags consumed by the bgt below
    shl     v0.2d, v0.2d, #6
    ld1     {v22.8b}, [x5], x2          // row 1
    add     x0, x0, #4
    st1     {v0.1d}, [x1]
    add     x1, x1, #8
    uxtl    v22.8h, v22.8b
    shl     v22.2d, v22.2d, #6
    st1     {v22.1d}, [x10], x6
    bgt     inner_loop_wd_4_ht_2
    b       end_loops

// ---------------- wide path: 8 samples per step ----------------
core_loop_wd_8:
    lsl     x5, x3, #1                  // x5  = destination row step in bytes
    sub     x20, x12, x3, lsl #2
    neg     x11, x20                    // x11 = 4 * dst_strd - 2 * wd (in 16-bit elements)
    sub     x20, x12, x2, lsl #2
    neg     x8, x20                     // x8  = 4 * src_strd - 2 * wd (in bytes)
    lsr     x4, x12, #3                 // tiles per band of 4 rows
    mul     x7, x9, x4                  // x7  = 4 * total number of tiles
    mov     x4, x12                     // x4  = samples left in the current band
    sub     x7, x7, #4                  // the last tile is finished by the epilog
    cmp     x9, #0
    beq     core_loop_wd_8_ht_2         // fewer than 4 rows: 2-row loop

prolog:
    add     x6, x0, x2                  // x6  -> source row 1
    add     x10, x1, x5                 // x10 -> destination row 1
    ld1     {v1.8b}, [x0], #8           // first tile, rows 0..3
    ld1     {v3.8b}, [x6], x2
    ld1     {v5.8b}, [x6], x2
    ld1     {v7.8b}, [x6], x2
    uxtl    v16.8h, v1.8b
    uxtl    v18.8h, v3.8b
    uxtl    v20.8h, v5.8b
    uxtl    v22.8h, v7.8b
    subs    x4, x4, #8                  // "le" from here on means: band of 4 rows complete
    shl     v0.8h, v16.8h, #6
    shl     v2.8h, v18.8h, #6
    shl     v4.8h, v20.8h, #6
    shl     v6.8h, v22.8h, #6
    add     x20, x0, x8
    csel    x0, x20, x0, le             // band complete: source moves to the next band
    add     x6, x0, x2
    ld1     {v1.8b}, [x0], #8           // preload the following tile
    ld1     {v3.8b}, [x6], x2
    ld1     {v5.8b}, [x6], x2
    ld1     {v7.8b}, [x6], x2
    st1     {v0.8h}, [x1], #16          // row 0 of the first tile
    add     x20, x1, x11, lsl #1
    csel    x1, x20, x1, le             // band complete: destination moves to the next band
    csel    x4, x12, x4, le             // band complete: reload the column counter
    subs    x7, x7, #4
    blt     epilog_end                  // only one tile in total
    beq     epilog                      // exactly two tiles in total

outer_loop_wd_8:
    st1     {v2.8h}, [x10], x5          // rows 1..3 of the previous tile
    uxtl    v16.8h, v1.8b               // widen the preloaded tile
    st1     {v4.8h}, [x10], x5
    uxtl    v18.8h, v3.8b
    st1     {v6.8h}, [x10], x5
    uxtl    v20.8h, v5.8b
    uxtl    v22.8h, v7.8b
    subs    x4, x4, #8                  // "le": band of 4 rows complete
    add     x20, x0, x8
    csel    x0, x20, x0, le
    add     x6, x0, x2
    ld1     {v1.8b}, [x0], #8           // preload the following tile
    shl     v0.8h, v16.8h, #6
    ld1     {v3.8b}, [x6], x2
    shl     v2.8h, v18.8h, #6
    ld1     {v5.8b}, [x6], x2
    shl     v4.8h, v20.8h, #6
    ld1     {v7.8b}, [x6], x2
    add     x10, x1, x5                 // row 1 of the current tile (before x1 advances)
    shl     v6.8h, v22.8h, #6
    st1     {v0.8h}, [x1], #16          // row 0 of the current tile
    add     x20, x1, x11, lsl #1
    csel    x1, x20, x1, le
    csel    x4, x12, x4, le
    subs    x7, x7, #4
    bgt     outer_loop_wd_8

epilog:
    st1     {v2.8h}, [x10], x5          // rows 1..3 of the previous tile
    uxtl    v16.8h, v1.8b               // last tile: already loaded, nothing more to fetch
    st1     {v4.8h}, [x10], x5
    uxtl    v18.8h, v3.8b
    st1     {v6.8h}, [x10], x5
    uxtl    v20.8h, v5.8b
    uxtl    v22.8h, v7.8b
    shl     v0.8h, v16.8h, #6
    shl     v2.8h, v18.8h, #6
    shl     v4.8h, v20.8h, #6
    add     x10, x1, x5                 // row 1 of the last tile
    shl     v6.8h, v22.8h, #6
    st1     {v0.8h}, [x1], #16          // row 0 of the last tile

epilog_end:
    st1     {v2.8h}, [x10], x5          // rows 1..3 of the last tile
    st1     {v4.8h}, [x10], x5
    st1     {v6.8h}, [x10], x5
    b       end_loops

// Two-row variant of the wide loop, used when ht < 4.
core_loop_wd_8_ht_2:
    add     x6, x0, x2                  // x6  -> source row 1
    add     x10, x1, x5                 // x10 -> destination row 1
    ld1     {v1.8b}, [x0], #8           // row 0, 8 samples
    ld1     {v3.8b}, [x6], x2           // row 1, 8 samples
    uxtl    v16.8h, v1.8b
    uxtl    v18.8h, v3.8b
    subs    x12, x12, #8                // samples left; flags consumed by the bgt below
    shl     v0.8h, v16.8h, #6
    shl     v2.8h, v18.8h, #6
    st1     {v0.8h}, [x1], #16
    st1     {v2.8h}, [x10], x5
    bgt     core_loop_wd_8_ht_2
    ldp     x19, x20, [sp], #16
    ret