blob: 71472001037addfa7ac3ddaac3a6d8b159340bfd [file] [log] [blame]
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//******************************************************************************
//* //file
//* ihevc_inter_pred_filters_luma_vert_w16inp.s
//*
//* //brief
//* contains function definitions for inter prediction interpolation.
//* functions are coded using neon intrinsics and can be compiled using
//* rvct
//*
//* //author
//* yogeswaran rs
//*
//* //par list of functions:
//*
//* - ihevc_inter_pred_luma_vert()
//*
//* //remarks
//* none
//*
//*******************************************************************************
//*/
///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
///* include reconstruction */
//
///**
//*******************************************************************************
//*
//* //brief
//* luma vertical filter for 16bit input.
//*
//* //par description:
//* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
//* the elements pointed by 'pu1_src' and writes to the location pointed by
//* 'pu1_dst' input is 16 bits the filter output is downshifted by 12 and
//* clipped to lie between 0 and 255 assumptions : the function is
//* optimized considering the fact width is multiple of 4. and height as
//* multiple of 2.
//*
//* //param[in] pi2_src
//* word16 pointer to the source
//*
//* //param[out] pu1_dst
//* uword8 pointer to the destination
//*
//* //param[in] src_strd
//* integer source stride
//*
//* //param[in] dst_strd
//* integer destination stride
//*
//* //param[in] pi1_coeff
//* word8 pointer to the filter coefficients
//*
//* //param[in] ht
//* integer height of the array
//*
//* //param[in] wd
//* integer width of the array
//*
//* //returns
//*
//* //remarks
//* none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_luma_vert_w16inp(word16 *pi2_src,
// uword8 *pu1_dst,
// word32 src_strd,
// word32 dst_strd,
// word8 *pi1_coeff,
// word32 ht,
// word32 wd )
//**************variables vs registers*****************************************
// r0 => *pu2_src
// r1 => *pu1_dst
// r2 => src_strd
// r3 => dst_strd
// r4 => *pi1_coeff
// r5 => ht
// r6 => wd
.text
.align 4
.include "ihevc_neon_macros.s"
.globl ihevc_inter_pred_luma_vert_w16inp_w16out_av8
.type ihevc_inter_pred_luma_vert_w16inp_w16out_av8, %function
ihevc_inter_pred_luma_vert_w16inp_w16out_av8:
//stmfd sp!, {r4-r12, r14} //stack stores the values of the arguments
stp x19,x20,[sp, #-16]!
mov x15,x4 // pi1_coeff
mov x16,x5 // ht
mov x17,x6 // wd
mov x12,x15 //load pi1_coeff
lsl x6,x3,#1
mov x5,x17 //load wd
ld1 {v0.8b},[x12] //coeff = ld1_s8(pi1_coeff)
lsl x2, x2,#1
sub x12,x2,x2,lsl #2 //src_ctrd & pi1_coeff
//vabs.s8 d0,d0 //vabs_s8(coeff)
add x0,x0,x12 //r0->pu1_src r12->pi1_coeff
mov x3,x16 //load ht
subs x7,x3,#0 //r3->ht
//ble end_loops //end loop jump
sxtl v0.8h,v0.8b
dup v22.4h,v0.h[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
dup v23.4h,v0.h[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
dup v24.4h,v0.h[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
dup v25.4h,v0.h[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
dup v26.4h,v0.h[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
dup v27.4h,v0.h[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
dup v28.4h,v0.h[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
dup v29.4h,v0.h[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
movi v30.4s,#8, lsl #16
sub x9,x5,x6,lsl #2 //r6->dst_strd r5 ->wd
neg x9,x9
sub x8,x5,x2,lsl #2 //r2->src_strd
neg x8,x8
sub x8,x8,x5
sub x9,x9,x5
lsr x3, x5, #2 //divide by 4
mul x7, x7, x3 //multiply height by width
sub x7, x7, #4 //subtract by one for epilog
mov x4,x5 //r5 ->wd
//mov r2, r2, lsl #1
prolog:
add x3,x0,x2 //pu1_src_tmp += src_strd//
ld1 {v1.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
ld1 {v0.4h},[x0], #8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
subs x4,x4,#4
ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
smull v19.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
smlal v19.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
smlal v19.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
smlal v19.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
smlal v19.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
smlal v19.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
smlal v19.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
smlal v19.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
smull v20.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
add x20,x0,x8,lsl #0
csel x0,x20,x0,le
smlal v20.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
csel x4,x5,x4,le
smlal v20.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
smlal v20.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
smlal v20.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
add x3,x0,x2 //pu1_src_tmp += src_strd//
smlal v20.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
smlal v20.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
smlal v20.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
sub v19.4s, v19.4s, v30.4s
ld1 {v1.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
smull v21.4s,v3.4h,v23.4h
ld1 {v0.4h},[x0],#8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
smlal v21.4s,v2.4h,v22.4h
ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
smlal v21.4s,v4.4h,v24.4h
smlal v21.4s,v5.4h,v25.4h
smlal v21.4s,v6.4h,v26.4h
smlal v21.4s,v7.4h,v27.4h
smlal v21.4s,v16.4h,v28.4h
smlal v21.4s,v17.4h,v29.4h
add x14,x1,x6
sub v20.4s, v20.4s, v30.4s
shrn v19.4h, v19.4s, #6
//vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
smull v31.4s,v4.4h,v23.4h
smlal v31.4s,v3.4h,v22.4h
smlal v31.4s,v5.4h,v24.4h
smlal v31.4s,v6.4h,v25.4h
ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
smlal v31.4s,v7.4h,v26.4h
ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
smlal v31.4s,v16.4h,v27.4h
ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
smlal v31.4s,v17.4h,v28.4h
ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
smlal v31.4s,v18.4h,v29.4h
ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
st1 {v19.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
sub v21.4s, v21.4s, v30.4s
shrn v20.4h, v20.4s, #6
//vqrshrun d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
add x20, x1, x9
csel x1, x20, x1, le
subs x7,x7,#4
blt epilog_end //jumps to epilog_end
beq epilog //jumps to epilog
kernel_8:
smull v19.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
subs x4,x4,#4
smlal v19.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
add x20,x0,x8,lsl #0
csel x0,x20,x0,le
smlal v19.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
smlal v19.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
smlal v19.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
smlal v19.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
smlal v19.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
smlal v19.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
st1 {v20.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)//
sub v31.4S, v31.4s, v30.4s
shrn v21.4h, v21.4s, #6
//vqrshrun d12,q6,#6
ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
smull v20.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
smlal v20.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
smlal v20.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
smlal v20.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
smlal v20.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
smlal v20.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
st1 {v21.2s},[x14],x6
smlal v20.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
smlal v20.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
sub v19.4s, v19.4s, v30.4s
shrn v31.4h, v31.4s, #6
//vqrshrun d14,q7,#6
smull v21.4s,v3.4h,v23.4h
csel x4,x5,x4,le
smlal v21.4s,v2.4h,v22.4h
ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
smlal v21.4s,v4.4h,v24.4h
add x3,x0,x2 //pu1_src_tmp += src_strd//
smlal v21.4s,v5.4h,v25.4h
smlal v21.4s,v6.4h,v26.4h
st1 {v31.2s},[x14],x6
smlal v21.4s,v7.4h,v27.4h
ld1 {v1.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
smlal v21.4s,v16.4h,v28.4h
add x14,x1,x6
smlal v21.4s,v17.4h,v29.4h
ld1 {v0.4h},[x0],#8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
sub v20.4s, v20.4s, v30.4s
shrn v19.4h, v19.4s, #6
//vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
smull v31.4s,v4.4h,v23.4h
smlal v31.4s,v3.4h,v22.4h
smlal v31.4s,v5.4h,v24.4h
ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
smlal v31.4s,v6.4h,v25.4h
ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
smlal v31.4s,v7.4h,v26.4h
ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
smlal v31.4s,v16.4h,v27.4h
ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
smlal v31.4s,v17.4h,v28.4h
ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
smlal v31.4s,v18.4h,v29.4h
st1 {v19.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
sub v21.4s, v21.4s, v30.4s
shrn v20.4h, v20.4s, #6
add x20, x1, x9
csel x1, x20, x1, le
//vqrshrun d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
subs x7,x7,#4
bgt kernel_8 //jumps to kernel_8
epilog:
smull v19.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
smlal v19.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
smlal v19.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
smlal v19.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
smlal v19.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
smlal v19.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
smlal v19.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
smlal v19.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
st1 {v20.2s},[x14],x6
sub v31.4s, v31.4s, v30.4s
shrn v21.4h, v21.4s, #6
//vqrshrun d12,q6,#6
ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
smull v20.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
smlal v20.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
smlal v20.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
smlal v20.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
smlal v20.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
smlal v20.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
smlal v20.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
smlal v20.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
st1 {v21.2s},[x14],x6
sub v19.4s, v19.4s, v30.4s
shrn v31.4h, v31.4s, #6
//vqrshrun d14,q7,#6
ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
smull v21.4s,v3.4h,v23.4h
smlal v21.4s,v2.4h,v22.4h
smlal v21.4s,v4.4h,v24.4h
smlal v21.4s,v5.4h,v25.4h
smlal v21.4s,v6.4h,v26.4h
smlal v21.4s,v7.4h,v27.4h
smlal v21.4s,v16.4h,v28.4h
smlal v21.4s,v17.4h,v29.4h
st1 {v31.2s},[x14],x6
sub v20.4s, v20.4s, v30.4s
shrn v19.4h, v19.4s, #6
//vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
smull v31.4s,v4.4h,v23.4h
smlal v31.4s,v3.4h,v22.4h
smlal v31.4s,v5.4h,v24.4h
smlal v31.4s,v6.4h,v25.4h
smlal v31.4s,v7.4h,v26.4h
smlal v31.4s,v16.4h,v27.4h
smlal v31.4s,v17.4h,v28.4h
smlal v31.4s,v18.4h,v29.4h
sub v21.4s, v21.4s, v30.4s
shrn v20.4h, v20.4s, #6
//vqrshrun d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
add x14,x1,x6
st1 {v19.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
epilog_end:
st1 {v20.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)//
shrn v21.4h, v21.4s, #6
//vqrshrun d12,q6,#6
st1 {v21.2s},[x14],x6
sub v31.4s, v31.4s, v30.4s
shrn v31.4h, v31.4s, #6
//vqrshrun d14,q7,#6
st1 {v31.2s},[x14],x6
end_loops:
//ldmfd sp!,{r4-r12,r15} //reload the registers from sp
ldp x19, x20,[sp], #16
ret