//blob: dd1fba431edd0186bf5f61a479431d1b200aa867 [file] [log] [blame]
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//* ihevc_inter_pred_chroma_vert_neon.s
//*
//* //brief
//* contains function definitions for inter prediction interpolation.
//* functions are coded in armv8 (aarch64) neon assembly using gnu
//* assembler syntax
//*
//* //author
//* yogeswaran rs
//*
//* //par list of functions:
//*
//*
//* //remarks
//* none
//*
//*******************************************************************************
//*/
///**
///**
//*******************************************************************************
//*
//* //brief
//* chroma interprediction filter for vertical input
//*
//* //par description:
//* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
//* the elements pointed by 'pu1_src' and writes to the location pointed by
//* 'pu1_dst'. the output is rounded, down shifted by 6 and clipped to 8 bits.
//* assumptions : the function is optimized assuming the width is a multiple
//* of 2, 4 or 8 and the height is a multiple of 2.
//* widths of 4 and 8 are optimized further
//*
//* //param[in] pu1_src
//* uword8 pointer to the source
//*
//* //param[out] pu1_dst
//* uword8 pointer to the destination
//*
//* //param[in] src_strd
//* integer source stride
//*
//* //param[in] dst_strd
//* integer destination stride
//*
//* //param[in] pi1_coeff
//* word8 pointer to the filter coefficients
//*
//* //param[in] ht
//* integer height of the array
//*
//* //param[in] wd
//* integer width of the array
//*
//* //returns
//*
//* //remarks
//* none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_vert(uword8 *pu1_src,
// uword8 *pu1_dst,
// word32 src_strd,
// word32 dst_strd,
// word8 *pi1_coeff,
// word32 ht,
// word32 wd)
//**************variables vs registers*****************************************
//x0 => *pu1_src
//x1 => *pu1_dst
//x2 => src_strd
//x3 => dst_strd
//x4 => *pi1_coeff
//x5 => ht
//x6 => wd
.text
.align 4
.include "ihevc_neon_macros.s"
.globl ihevc_inter_pred_chroma_vert_av8
.type ihevc_inter_pred_chroma_vert_av8, %function
//-----------------------------------------------------------------------------
// void ihevc_inter_pred_chroma_vert_av8(uword8 *pu1_src,   (x0)
//                                       uword8 *pu1_dst,   (x1)
//                                       word32  src_strd,  (w2)
//                                       word32  dst_strd,  (w3)
//                                       word8  *pi1_coeff, (x4)
//                                       word32  ht,        (w5)
//                                       word32  wd)        (w6)
//
// 4-tap vertical filter. with r0..r3 being four consecutive source rows
// starting one row above the output row, each output byte is
//     sat_u8((c1*r1 - c0*r0 + c2*r2 - c3*r3 + 32) >> 6)
// where c0..c3 are the absolute values of the first four bytes at pi1_coeff.
// NOTE(review): this form presumes coeff[0] and coeff[3] are <= 0 and
// coeff[1] and coeff[2] are >= 0 - confirm against the coefficient tables.
//
// 2*wd bytes are processed per row.
// three code paths are selected from wd and ht:
//     (wd & 3) != 0               -> outer_loop_wd_2 (4 bytes x 2 rows / iter)
//     (wd & 3) == 0, (ht & 7) == 0 -> core_loop_ht_8 (8 bytes x 4 rows / iter,
//                                     software pipelined)
//     otherwise                   -> inner_loop_ht_2 (8 bytes x 2 rows / iter)
//
// the 32-bit arguments are sign extended on entry because the upper 32 bits
// of w2, w3, w5 and w6 are not guaranteed by the procedure call standard.
//
// clobbers : x0-x12, x14-x17, flags, v0-v7, v16-v18, v24, v26, v28, v30
// preserves: x19, x20 (saved and restored on the stack)
//-----------------------------------------------------------------------------
ihevc_inter_pred_chroma_vert_av8:
    stp         x19, x20,[sp,#-16]!     //x20 is used as scratch below
    sxtw        x2,w2                   //sign extend src_strd to 64 bits
    sxtw        x3,w3                   //sign extend dst_strd to 64 bits
    mov         x15,x4                  //pi1_coeff
    sxtw        x16,w5                  //ht, sign extended to 64 bits
    sxtw        x17,w6                  //wd, sign extended to 64 bits
    mov         x4,x16                  //x4 = ht
    mov         x12,x15                 //x12 = pi1_coeff
    cmp         x4,#0                   //checks ht <= 0
    mov         x6,x17                  //x6 = wd
    sub         x0,x0,x2                //pu1_src - src_strd (first filter tap row)
    ld1         {v0.8b},[x12]           //loads 8 bytes from pi1_coeff (4 are used)
    ble         end_loops               //nothing to do when ht <= 0
    tst         x6,#3                   //checks (wd & 3)
    abs         v3.8b, v0.8b            //vabs_s8(coeff)
    lsl         x10,x6,#1               //x10 = 2*wd (bytes per row)
    dup         v0.8b, v3.b[0]          //coeffabs_0
    dup         v1.8b, v3.b[1]          //coeffabs_1
    dup         v2.8b, v3.b[2]          //coeffabs_2
    dup         v3.8b, v3.b[3]          //coeffabs_3
    bgt         outer_loop_wd_2         //taken when (wd & 3) != 0 (flags from tst)
    tst         x4,#7                   //checks ht for multiple of 8
    beq         core_loop_ht_8          //when height is multiple of 8
    lsl         x7,x3,#1                //2*dst_strd
    sub         x9,x7,x10               //2*dst_strd - 2wd
    lsl         x12,x2,#1               //2*src_strd
    sub         x8,x12,x10              //2*src_strd - 2wd
    mov         x5,x10                  //x5 = 2wd (bytes left in this row pair)

inner_loop_ht_2:                        //wd multiple of 4, ht not a multiple of 8
    add         x6,x0,x2                //pu1_src + src_strd
    ld1         {v17.8b},[x6],x2        //r1
    subs        x5,x5,#8                //2wd - 8 (flags consumed by first bgt below)
    ld1         {v5.8b},[x0],#8         //r0, pu1_src += 8
    umull       v6.8h, v17.8b, v1.8b    //row0  = r1 * coeffabs_1
    ld1         {v4.8b},[x6],x2         //r2
    umlsl       v6.8h, v5.8b, v0.8b     //row0 -= r0 * coeffabs_0
    ld1         {v16.8b},[x6],x2        //r3
    umlal       v6.8h, v4.8b, v2.8b     //row0 += r2 * coeffabs_2
    umull       v4.8h, v4.8b, v1.8b     //row1  = r2 * coeffabs_1 (v4 reused as accumulator)
    umlsl       v6.8h, v16.8b, v3.8b    //row0 -= r3 * coeffabs_3
    umlsl       v4.8h, v17.8b, v0.8b    //row1 -= r1 * coeffabs_0
    ld1         {v18.8b},[x6]           //r4
    umlal       v4.8h, v16.8b, v2.8b    //row1 += r3 * coeffabs_2
    sqrshrun    v6.8b, v6.8h,#6         //row0: round, shift right by 6, saturate to u8
    umlsl       v4.8h, v18.8b, v3.8b    //row1 -= r4 * coeffabs_3
    add         x6,x1,x3                //pu1_dst + dst_strd
    sqrshrun    v4.8b, v4.8h,#6         //row1: round, shift right by 6, saturate to u8
    st1         {v6.8b},[x1],#8         //stores row0, pu1_dst += 8
    st1         {v4.8b},[x6]            //stores row1
    bgt         inner_loop_ht_2         //more 8-byte columns in this row pair
    subs        x4,x4,#2                //ht - 2
    add         x1,x1,x9                //pu1_dst += (2*dst_strd - 2wd)
    mov         x5,x10                  //reload 2wd
    add         x0,x0,x8                //pu1_src += (2*src_strd - 2wd)
    bgt         inner_loop_ht_2         //next pair of rows
    b           end_loops               //jumps to end

outer_loop_wd_2:                        //called when (wd & 3) != 0
    lsl         x5,x3,#1                //2*dst_strd
    mov         x12,x10                 //x12 = 2wd (bytes left in this row pair)
    sub         x9,x5,x10               //2*dst_strd - 2wd
    lsl         x7,x2,#1                //2*src_strd
    sub         x8,x7,x10               //2*src_strd - 2wd

inner_loop_wd_2:                        //4 bytes of two output rows per iteration
    add         x6,x0,x2                //pu1_src + src_strd
    ld1         {v6.s}[0],[x0]          //v6 = {r0, -}
    subs        x12,x12,#4              //2wd - 4 (flags consumed by first bgt below)
    add         x0,x0,#4                //pu1_src + 4
    ld1         {v6.s}[1],[x6],x2       //v6 = {r0, r1}
    dup         v7.2s, v6.s[1]          //v7 = {r1, r1}
    ld1         {v7.s}[1],[x6],x2       //v7 = {r1, r2}
    umull       v4.8h, v7.8b, v1.8b     //acc  = {r1, r2} * coeffabs_1
    dup         v7.2s, v7.s[1]          //v7 = {r2, r2}
    ld1         {v7.s}[1],[x6],x2       //v7 = {r2, r3}
    umlsl       v4.8h, v6.8b, v0.8b     //acc -= {r0, r1} * coeffabs_0
    umlal       v4.8h, v7.8b, v2.8b     //acc += {r2, r3} * coeffabs_2
    dup         v7.2s, v7.s[1]          //v7 = {r3, r3}
    ld1         {v7.s}[1],[x6]          //v7 = {r3, r4}
    add         x6,x1,x3                //pu1_dst + dst_strd
    umlsl       v4.8h, v7.8b, v3.8b     //acc -= {r3, r4} * coeffabs_3
    sqrshrun    v4.8b, v4.8h,#6         //round, shift right by 6, saturate to u8
    st1         {v4.s}[0],[x1]          //stores 4 bytes of row0
    add         x1,x1,#4                //pu1_dst += 4
    st1         {v4.s}[1],[x6]          //stores 4 bytes of row1
    bgt         inner_loop_wd_2         //more 4-byte columns in this row pair
    //inner loop ends
    subs        x4,x4,#2                //ht - 2
    add         x1,x1,x9                //pu1_dst += 2*dst_strd - 2*wd
    mov         x12,x10                 //reload 2wd
    add         x0,x0,x8                //pu1_src += 2*src_strd - 2*wd
    bgt         inner_loop_wd_2         //next pair of rows
    b           end_loops               //jumps to end

core_loop_ht_8:                         //wd multiple of 4 and ht multiple of 8
    lsl         x12,x3,#2               //4*dst_strd
    sub         x8,x12,x10              //x8 = 4*dst_strd - 2wd
    lsl         x12,x2,#2               //4*src_strd
    sub         x9,x12,x10              //x9 = 4*src_strd - 2wd
    bic         x5,x10,#7               //x5 = 2wd rounded down to multiple of 8
    lsr         x14, x10, #3            //number of 8-byte columns per row
    mul         x12, x4 , x14           //ht * columns
    sub         x12, x12,#4             //one 4-row block is left for the epilog

prolog:                                 //first 8x4 block, and loads for the next one
    add         x6,x0,x2                //pu1_src + src_strd
    ld1         {v5.8b},[x6],x2         //r1
    subs        x5,x5,#8                //2wd - 8 (flags drive the csel's below)
    ld1         {v4.8b},[x0],#8         //r0, pu1_src += 8
    ld1         {v6.8b},[x6],x2         //r2
    umull       v30.8h, v5.8b, v1.8b    //row0  = r1 * coeffabs_1
    ld1         {v7.8b},[x6],x2         //r3
    umlsl       v30.8h, v4.8b, v0.8b    //row0 -= r0 * coeffabs_0
    add         x7,x1,x3                //x7 = pu1_dst + dst_strd
    umlal       v30.8h, v6.8b, v2.8b    //row0 += r2 * coeffabs_2
    umlsl       v30.8h, v7.8b, v3.8b    //row0 -= r3 * coeffabs_3
    ld1         {v16.8b},[x6],x2        //r4
    umull       v28.8h, v6.8b, v1.8b    //row1  = r2 * coeffabs_1
    add         x20,x0,x9               //pu1_src + (4*src_strd - 2*wd)
    csel        x0, x20, x0,le          //end of row: move src down by 4 rows
    umlsl       v28.8h, v5.8b, v0.8b    //row1 -= r1 * coeffabs_0
    bic         x20,x10,#7              //2wd rounded down to multiple of 8
    csel        x5, x20, x5,le          //end of row: reload the column counter
    umlal       v28.8h, v7.8b, v2.8b    //row1 += r3 * coeffabs_2
    ld1         {v17.8b},[x6],x2        //r5
    umlsl       v28.8h, v16.8b, v3.8b   //row1 -= r4 * coeffabs_3
    sqrshrun    v30.8b, v30.8h,#6       //row0: round, shift right by 6, saturate to u8
    ld1         {v18.8b},[x6],x2        //r6
    umull       v26.8h, v7.8b, v1.8b    //row2  = r3 * coeffabs_1
    add         x6,x0,x2                //pu1_src + src_strd (next block)
    umlsl       v26.8h, v6.8b, v0.8b    //row2 -= r2 * coeffabs_0
    st1         {v30.8b},[x1],#8        //stores row0, pu1_dst += 8
    umlal       v26.8h, v16.8b, v2.8b   //row2 += r4 * coeffabs_2
    ld1         {v4.8b},[x0],#8         //next block r0
    umlsl       v26.8h, v17.8b, v3.8b   //row2 -= r5 * coeffabs_3
    sqrshrun    v28.8b, v28.8h,#6       //row1: round, shift right by 6, saturate to u8
    add         x20,x1,x8               //pu1_dst + (4*dst_strd - 2*wd)
    csel        x1, x20, x1,le          //end of row: move dst down by 4 rows
    umull       v24.8h, v16.8b, v1.8b   //row3  = r4 * coeffabs_1
    ld1         {v5.8b},[x6],x2         //next block r1
    umlsl       v24.8h, v7.8b, v0.8b    //row3 -= r3 * coeffabs_0
    subs        x12,x12,#4              //one 4-row block done (flags for ble below)
    ld1         {v6.8b},[x6],x2         //next block r2
    umlal       v24.8h, v17.8b, v2.8b   //row3 += r5 * coeffabs_2
    ld1         {v7.8b},[x6],x2         //next block r3
    umlsl       v24.8h, v18.8b, v3.8b   //row3 -= r6 * coeffabs_3
    st1         {v28.8b},[x7],x3        //stores row1
    sqrshrun    v26.8b, v26.8h,#6       //row2: round, shift right by 6, saturate to u8
    sub         x20,x2,x2,lsl #3        //-7*src_strd
    neg         x11, x20                //x11 = 7*src_strd (prefetch offset)
    add         x14,x2,x2,lsl #1        //3*src_strd
    add         x14,x14,x11             //x14 = 10*src_strd (prefetch offset limit)
    ble         epilog                  //only the final block is left

kernel_8:                               //steady state: row2/row3 of the previous
                                        //block are stored while this one is computed
    umull       v30.8h, v5.8b, v1.8b    //row0  = r1 * coeffabs_1
    subs        x5,x5,#8                //2wd - 8 (flags drive the le csel's below)
    umlsl       v30.8h, v4.8b, v0.8b    //row0 -= r0 * coeffabs_0
    add         x20,x0,x9               //pu1_src + (4*src_strd - 2*wd)
    csel        x0, x20, x0,le          //end of row: move src down by 4 rows
    umlal       v30.8h, v6.8b, v2.8b    //row0 += r2 * coeffabs_2
    lsl         x20,x2,#3               //8*src_strd
    sub         x20,x20,x2              //7*src_strd
    csel        x11,x20,x11,le          //end of row: reset prefetch offset
    umlsl       v30.8h, v7.8b, v3.8b    //row0 -= r3 * coeffabs_3
    st1         {v26.8b},[x7],x3        //stores row2 of the previous block
    sqrshrun    v24.8b, v24.8h,#6       //previous row3: round, shift, saturate
    ld1         {v16.8b},[x6],x2        //r4
    umull       v28.8h, v6.8b, v1.8b    //row1  = r2 * coeffabs_1
    bic         x20,x10,#7              //2wd rounded down to multiple of 8
    csel        x5, x20, x5,le          //end of row: reload the column counter
    umlsl       v28.8h, v5.8b, v0.8b    //row1 -= r1 * coeffabs_0
    st1         {v24.8b},[x7],x3        //stores row3 of the previous block
    umlal       v28.8h, v7.8b, v2.8b    //row1 += r3 * coeffabs_2
    ld1         {v17.8b},[x6],x2        //r5
    sqrshrun    v30.8b, v30.8h,#6       //row0: round, shift right by 6, saturate to u8
    umlsl       v28.8h, v16.8b, v3.8b   //row1 -= r4 * coeffabs_3
    ld1         {v18.8b},[x6],x2        //r6
    add         x7,x1,x3                //x7 = pu1_dst + dst_strd
    umull       v26.8h, v7.8b, v1.8b    //row2  = r3 * coeffabs_1
    add         x6,x0,x2                //pu1_src + src_strd (next block)
    add         x20,x0, x11             //prefetch address = pu1_src + x11
    prfm        PLDL1KEEP,[x20]         //prefetch upcoming source data
    umlsl       v26.8h, v6.8b, v0.8b    //row2 -= r2 * coeffabs_0
    ld1         {v4.8b},[x0],#8         //next block r0
    umlal       v26.8h, v16.8b, v2.8b   //row2 += r4 * coeffabs_2
    st1         {v30.8b},[x1],#8        //stores row0, pu1_dst += 8
    umlsl       v26.8h, v17.8b, v3.8b   //row2 -= r5 * coeffabs_3
    ld1         {v5.8b},[x6],x2         //next block r1
    add         x11,x11,x2              //prefetch offset += src_strd
    sqrshrun    v28.8b, v28.8h,#6       //row1: round, shift right by 6, saturate to u8
    umull       v24.8h, v16.8b, v1.8b   //row3  = r4 * coeffabs_1
    ld1         {v6.8b},[x6],x2         //next block r2
    add         x20,x1,x8               //pu1_dst + (4*dst_strd - 2*wd)
    csel        x1, x20, x1,le          //end of row: move dst down by 4 rows
    cmp         x11,x14                 //prefetch offset beyond 10*src_strd ?
    lsl         x20,x2,#3               //8*src_strd
    sub         x20,x20,x2              //7*src_strd
    csel        x11,x20,x11,gt          //yes: wrap prefetch offset to 7*src_strd
    umlsl       v24.8h, v7.8b, v0.8b    //row3 -= r3 * coeffabs_0
    subs        x12,x12,#4              //one 4-row block done (flags for bgt below)
    umlal       v24.8h, v17.8b, v2.8b   //row3 += r5 * coeffabs_2
    ld1         {v7.8b},[x6],x2         //next block r3
    umlsl       v24.8h, v18.8b, v3.8b   //row3 -= r6 * coeffabs_3
    st1         {v28.8b},[x7],x3        //stores row1
    sqrshrun    v26.8b, v26.8h,#6       //row2: round, shift right by 6, saturate to u8
    bgt         kernel_8                //more blocks before the final one

epilog:                                 //final block: r0..r3 are already loaded
    umull       v30.8h, v5.8b, v1.8b    //row0  = r1 * coeffabs_1
    umlsl       v30.8h, v4.8b, v0.8b    //row0 -= r0 * coeffabs_0
    umlal       v30.8h, v6.8b, v2.8b    //row0 += r2 * coeffabs_2
    umlsl       v30.8h, v7.8b, v3.8b    //row0 -= r3 * coeffabs_3
    st1         {v26.8b},[x7],x3        //stores row2 of the previous block
    sqrshrun    v24.8b, v24.8h,#6       //previous row3: round, shift, saturate
    ld1         {v16.8b},[x6],x2        //r4
    umull       v28.8h, v6.8b, v1.8b    //row1  = r2 * coeffabs_1
    umlsl       v28.8h, v5.8b, v0.8b    //row1 -= r1 * coeffabs_0
    umlal       v28.8h, v7.8b, v2.8b    //row1 += r3 * coeffabs_2
    umlsl       v28.8h, v16.8b, v3.8b   //row1 -= r4 * coeffabs_3
    st1         {v24.8b},[x7],x3        //stores row3 of the previous block
    sqrshrun    v30.8b, v30.8h,#6       //row0: round, shift right by 6, saturate to u8
    ld1         {v17.8b},[x6],x2        //r5
    umull       v26.8h, v7.8b, v1.8b    //row2  = r3 * coeffabs_1
    add         x7,x1,x3                //x7 = pu1_dst + dst_strd
    umlsl       v26.8h, v6.8b, v0.8b    //row2 -= r2 * coeffabs_0
    st1         {v30.8b},[x1],#8        //stores row0
    sqrshrun    v28.8b, v28.8h,#6       //row1: round, shift right by 6, saturate to u8
    umlal       v26.8h, v16.8b, v2.8b   //row2 += r4 * coeffabs_2
    ld1         {v18.8b},[x6],x2        //r6
    umlsl       v26.8h, v17.8b, v3.8b   //row2 -= r5 * coeffabs_3
    umull       v24.8h, v16.8b, v1.8b   //row3  = r4 * coeffabs_1
    sqrshrun    v26.8b, v26.8h,#6       //row2: round, shift right by 6, saturate to u8
    st1         {v28.8b},[x7],x3        //stores row1
    umlsl       v24.8h, v7.8b, v0.8b    //row3 -= r3 * coeffabs_0
    umlal       v24.8h, v17.8b, v2.8b   //row3 += r5 * coeffabs_2
    st1         {v26.8b},[x7],x3        //stores row2
    umlsl       v24.8h, v18.8b, v3.8b   //row3 -= r6 * coeffabs_3
    sqrshrun    v24.8b, v24.8h,#6       //row3: round, shift right by 6, saturate to u8
    st1         {v24.8b},[x7],x3        //stores row3

end_loops:
    ldp         x19, x20,[sp],#16       //restore callee-saved registers
    ret