// blob: 9564f99cc9d50670d423d24a4f2b198656d9387e [file] [log] [blame]
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//* ih264_inter_pred_luma_vert_av8.s
//*
//* @brief
//* Contains function definitions for inter prediction interpolation.
//*
//* @author
//* Ittiam
//*
//* @par List of Functions:
//*
//* - ih264_inter_pred_luma_vert_av8()
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
///* All the functions here are replicated from ih264_inter_pred_filters.c
//
///**
///**
///**
// *******************************************************************************
// *
// * @brief
// * Interprediction luma filter for vertical input
// *
// * @par Description:
// * Applies a 6-tap vertical filter. The output is clipped to 8 bits.
// * See sec 8.4.2.2.1 titled "Luma sample interpolation process".
// *
// * @param[in] pu1_src
// * UWORD8 pointer to the source
// *
// * @param[out] pu1_dst
// * UWORD8 pointer to the destination
// *
// * @param[in] src_strd
// * integer source stride
// *
// * @param[in] dst_strd
// * integer destination stride
// *
// * @param[in] ht
// * integer height of the array
// *
// * @param[in] wd
// * integer width of the array
// *
// * @returns
// *
// * @remarks
// * None
// *
// *******************************************************************************
//void ih264_inter_pred_luma_vert (
// UWORD8 *pu1_src,
// UWORD8 *pu1_dst,
// WORD32 src_strd,
// WORD32 dst_strd,
// WORD32 ht,
// WORD32 wd )
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
// x2 => src_strd
// x3 => dst_strd
// x4 => ht
// x5 => wd
.text
.p2align 2
.include "ih264_neon_macros.s"
.global ih264_inter_pred_luma_vert_av8
//------------------------------------------------------------------------------
// void ih264_inter_pred_luma_vert_av8(UWORD8 *pu1_src, UWORD8 *pu1_dst,
//                                     WORD32 src_strd, WORD32 dst_strd,
//                                     WORD32 ht, WORD32 wd)
//
// Vertical 6-tap luma interpolation. For every output row k of a pass:
//     acc     = r[k] + r[k+5] + 20 * (r[k+2] + r[k+3]) - 5 * (r[k+1] + r[k+4])
//     dst[k]  = sqrshrun(acc, #5)      // rounded >> 5, saturated to unsigned 8 bit
// where r[n] is source row n counted from (pu1_src - 2 * src_strd).
//
// In:   x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
//       w4 = ht, w5 = wd
// Dispatch as implemented below:
//       wd == 8 -> loop_8_start  (ht == 4: 4 rows, ht == 16: 16 rows, else 8 rows)
//       wd == 4 -> loop_4_start  (ht == 4: 4 rows, else 8 rows)
//       else    -> 16-wide path  (ht == 16: 16 rows, else 8 rows)
// Scratch: x9, x12, x14, flags, v0-v12, v14, v16, v18, v20, v22, v24, v26-v31
//
// Comment notation: rN = source row N of the current pass, "lo" = columns 0-7,
// "hi" = columns 8-15, "rowK" = accumulator for output row K of the pass.
//
// NOTE(review): push_v_regs / pop_v_regs / swp are macros from
// ih264_neon_macros.s, which is not visible here. The comments below assume
// push_v_regs/pop_v_regs save/restore the callee-saved SIMD registers and that
// swp exchanges its two operands without touching the flags - confirm.
//------------------------------------------------------------------------------
ih264_inter_pred_luma_vert_av8:
    push_v_regs
    stp       x19, x20, [sp, #-16]!     // x19/x20 are not referenced by the visible code; save kept as-is

    // The four integer arguments are WORD32 in the C prototype. AAPCS64 leaves
    // the upper 32 bits of such argument registers unspecified, so widen them
    // before they are used as 64-bit strides / in 64-bit compares.
    sxtw      x2, w2                    // src_strd
    sxtw      x3, w3                    // dst_strd
    sxtw      x4, w4                    // ht
    sxtw      x5, w5                    // wd

    sub       x0, x0, x2, lsl #1        // x0 = pu1_src - 2 * src_strd (first tap row)
    sub       x14, x4, #16              // x14 = ht - 16, loop bookkeeping
    movi      v22.8h, #20               // v22 = filter coefficient 20
    subs      x12, x5, #8               // wd == 8 ?
    movi      v24.8h, #5                // v24 = filter coefficient 5 (movi leaves flags intact)
    beq       loop_8_start
    subs      x12, x5, #4               // wd == 4 ?
    beq       loop_4_start

    //--------------------------------------------------------------------------
    // 16-wide path: each row lives in a register pair (lo, hi).
    //--------------------------------------------------------------------------
    ld1       {v0.2s, v1.2s}, [x0], x2      // r0
    ld1       {v2.2s, v3.2s}, [x0], x2      // r1
    ld1       {v4.2s, v5.2s}, [x0], x2      // r2
    ld1       {v6.2s, v7.2s}, [x0], x2      // r3
    add       x14, x14, #1                  // x14 = ht - 15
    ld1       {v8.2s, v9.2s}, [x0], x2      // r4
    uaddl     v12.8h, v4.8b, v6.8b          // r2 + r3 (lo)
    ld1       {v10.2s, v11.2s}, [x0], x2    // r5

loop_16:                                    // one pass = 8 output rows
    uaddl     v14.8h, v0.8b, v10.8b         // row0 lo  = r0 + r5
    uaddl     v16.8h, v2.8b, v8.8b          // r1 + r4 (lo)
    mla       v14.8h, v12.8h, v22.8h        // row0 lo += 20 * (r2 + r3)
    uaddl     v20.8h, v1.8b, v11.8b         // row0 hi  = r0 + r5
    uaddl     v18.8h, v5.8b, v7.8b          // r2 + r3 (hi)
    mla       v20.8h, v18.8h, v22.8h        // row0 hi += 20 * (r2 + r3)
    ld1       {v0.2s, v1.2s}, [x0], x2      // r6
    uaddl     v26.8h, v3.8b, v9.8b          // r1 + r4 (hi)
    uaddl     v12.8h, v6.8b, v8.8b          // r3 + r4 (lo)
    mls       v14.8h, v16.8h, v24.8h        // row0 lo -= 5 * (r1 + r4)
    uaddl     v16.8h, v2.8b, v0.8b          // row1 lo  = r1 + r6
    uaddl     v18.8h, v4.8b, v10.8b         // r2 + r5 (lo)
    mla       v16.8h, v12.8h, v22.8h        // row1 lo += 20 * (r3 + r4)
    mls       v20.8h, v26.8h, v24.8h        // row0 hi -= 5 * (r1 + r4)
    uaddl     v26.8h, v5.8b, v11.8b         // r2 + r5 (hi)
    uaddl     v12.8h, v7.8b, v9.8b          // r3 + r4 (hi)
    sqrshrun  v30.8b, v14.8h, #5            // narrow row0 lo
    uaddl     v14.8h, v3.8b, v1.8b          // row1 hi  = r1 + r6
    ld1       {v2.2s, v3.2s}, [x0], x2      // r7
    mla       v14.8h, v12.8h, v22.8h        // row1 hi += 20 * (r3 + r4)
    mls       v16.8h, v18.8h, v24.8h        // row1 lo -= 5 * (r2 + r5)
    sqrshrun  v31.8b, v20.8h, #5            // narrow row0 hi
    uaddl     v18.8h, v4.8b, v2.8b          // row2 lo  = r2 + r7
    uaddl     v12.8h, v8.8b, v10.8b         // r4 + r5 (lo)
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 0
    mla       v18.8h, v12.8h, v22.8h        // row2 lo += 20 * (r4 + r5)
    uaddl     v20.8h, v6.8b, v0.8b          // r3 + r6 (lo)
    mls       v14.8h, v26.8h, v24.8h        // row1 hi -= 5 * (r2 + r5)
    sqrshrun  v30.8b, v16.8h, #5            // narrow row1 lo
    uaddl     v12.8h, v9.8b, v11.8b         // r4 + r5 (hi)
    uaddl     v16.8h, v5.8b, v3.8b          // row2 hi  = r2 + r7
    uaddl     v26.8h, v7.8b, v1.8b          // r3 + r6 (hi)
    mla       v16.8h, v12.8h, v22.8h        // row2 hi += 20 * (r4 + r5)
    mls       v18.8h, v20.8h, v24.8h        // row2 lo -= 5 * (r3 + r6)
    ld1       {v4.2s, v5.2s}, [x0], x2      // r8
    sqrshrun  v31.8b, v14.8h, #5            // narrow row1 hi
    uaddl     v12.8h, v10.8b, v0.8b         // r5 + r6 (lo)
    uaddl     v14.8h, v6.8b, v4.8b          // row3 lo  = r3 + r8
    uaddl     v20.8h, v8.8b, v2.8b          // r4 + r7 (lo)
    mla       v14.8h, v12.8h, v22.8h        // row3 lo += 20 * (r5 + r6)
    mls       v16.8h, v26.8h, v24.8h        // row2 hi -= 5 * (r3 + r6)
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 1
    sqrshrun  v30.8b, v18.8h, #5            // narrow row2 lo
    uaddl     v18.8h, v7.8b, v5.8b          // row3 hi  = r3 + r8
    uaddl     v12.8h, v11.8b, v1.8b         // r5 + r6 (hi)
    mla       v18.8h, v12.8h, v22.8h        // row3 hi += 20 * (r5 + r6)
    uaddl     v26.8h, v9.8b, v3.8b          // r4 + r7 (hi)
    mls       v14.8h, v20.8h, v24.8h        // row3 lo -= 5 * (r4 + r7)
    ld1       {v6.2s, v7.2s}, [x0], x2      // r9
    sqrshrun  v31.8b, v16.8h, #5            // narrow row2 hi
    mls       v18.8h, v26.8h, v24.8h        // row3 hi -= 5 * (r4 + r7)
    uaddl     v12.8h, v0.8b, v2.8b          // r6 + r7 (lo)
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 2
    uaddl     v16.8h, v10.8b, v4.8b         // r5 + r8 (lo)
    uaddl     v20.8h, v9.8b, v7.8b          // row4 hi  = r4 + r9
    sqrshrun  v30.8b, v14.8h, #5            // narrow row3 lo
    uaddl     v26.8h, v5.8b, v11.8b         // r5 + r8 (hi)
    uaddl     v14.8h, v8.8b, v6.8b          // row4 lo  = r4 + r9
    sqrshrun  v31.8b, v18.8h, #5            // narrow row3 hi
    mla       v14.8h, v12.8h, v22.8h        // row4 lo += 20 * (r6 + r7)
    uaddl     v18.8h, v1.8b, v3.8b          // r6 + r7 (hi)
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 3

    // 4 rows written, 4 more in this pass
    mla       v20.8h, v18.8h, v22.8h        // row4 hi += 20 * (r6 + r7)
    ld1       {v8.2s, v9.2s}, [x0], x2      // r10
    uaddl     v12.8h, v2.8b, v4.8b          // r7 + r8 (lo)
    uaddl     v18.8h, v3.8b, v5.8b          // r7 + r8 (hi)
    mls       v14.8h, v16.8h, v24.8h        // row4 lo -= 5 * (r5 + r8)
    uaddl     v28.8h, v9.8b, v11.8b         // row5 hi  = r5 + r10
    uaddl     v16.8h, v6.8b, v0.8b          // r6 + r9 (lo)
    mla       v28.8h, v18.8h, v22.8h        // row5 hi += 20 * (r7 + r8)
    mls       v20.8h, v26.8h, v24.8h        // row4 hi -= 5 * (r5 + r8)
    uaddl     v26.8h, v1.8b, v7.8b          // r6 + r9 (hi)
    uaddl     v18.8h, v5.8b, v7.8b          // r8 + r9 (hi)
    sqrshrun  v30.8b, v14.8h, #5            // narrow row4 lo
    uaddl     v14.8h, v8.8b, v10.8b         // row5 lo  = r5 + r10
    sqrshrun  v31.8b, v20.8h, #5            // narrow row4 hi
    ld1       {v10.2s, v11.2s}, [x0], x2    // r11
    mls       v28.8h, v26.8h, v24.8h        // row5 hi -= 5 * (r6 + r9)
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 4
    mla       v14.8h, v12.8h, v22.8h        // row5 lo += 20 * (r7 + r8)
    uaddl     v20.8h, v11.8b, v1.8b         // row6 hi  = r6 + r11
    uaddl     v26.8h, v3.8b, v9.8b          // r7 + r10 (hi)
    mla       v20.8h, v18.8h, v22.8h        // row6 hi += 20 * (r8 + r9)
    uaddl     v12.8h, v6.8b, v4.8b          // r8 + r9 (lo)
    uaddl     v18.8h, v7.8b, v9.8b          // r9 + r10 (hi)
    sqrshrun  v31.8b, v28.8h, #5            // narrow row5 hi
    mls       v14.8h, v16.8h, v24.8h        // row5 lo -= 5 * (r6 + r9)
    uaddl     v16.8h, v8.8b, v2.8b          // r7 + r10 (lo)
    sqrshrun  v30.8b, v14.8h, #5            // narrow row5 lo
    mls       v20.8h, v26.8h, v24.8h        // row6 hi -= 5 * (r7 + r10)
    uaddl     v14.8h, v10.8b, v0.8b         // row6 lo  = r6 + r11
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 5
    mla       v14.8h, v12.8h, v22.8h        // row6 lo += 20 * (r8 + r9)
    ld1       {v0.2s, v1.2s}, [x0], x2      // r12
    uaddl     v26.8h, v5.8b, v11.8b         // r8 + r11 (hi)
    uaddl     v12.8h, v8.8b, v6.8b          // r9 + r10 (lo)
    uaddl     v28.8h, v0.8b, v2.8b          // row7 lo  = r7 + r12
    sqrshrun  v31.8b, v20.8h, #5            // narrow row6 hi
    mla       v28.8h, v12.8h, v22.8h        // row7 lo += 20 * (r9 + r10)
    uaddl     v20.8h, v1.8b, v3.8b          // row7 hi  = r7 + r12
    mls       v14.8h, v16.8h, v24.8h        // row6 lo -= 5 * (r7 + r10)
    mla       v20.8h, v18.8h, v22.8h        // row7 hi += 20 * (r9 + r10)
    uaddl     v16.8h, v10.8b, v4.8b         // r8 + r11 (lo)
    sqrshrun  v30.8b, v14.8h, #5            // narrow row6 lo

    // Re-seat the five newest rows (r8..r12) into v0/v1 .. v8/v9 so a second
    // pass can re-enter loop_16 with the same register layout. Interleaved
    // with the tail of row 7. (Relies on swp exchanging its operands.)
    mov       v2.8b, v6.8b                  // v2  <- r9 lo
    mov       v3.8b, v7.8b                  // v3  <- r9 hi
    mls       v28.8h, v16.8h, v24.8h        // row7 lo -= 5 * (r8 + r11)
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 6
    sqrshrun  v30.8b, v28.8h, #5            // narrow row7 lo
    swp       v0.8b, v4.8b                  // v0 <- r8 lo, v4 <- r12 lo
    swp       v1.8b, v5.8b                  // v1 <- r8 hi, v5 <- r12 hi
    mls       v20.8h, v26.8h, v24.8h        // row7 hi -= 5 * (r8 + r11)
    mov       v6.8b, v10.8b                 // v6  <- r11 lo
    mov       v7.8b, v11.8b                 // v7  <- r11 hi
    subs      x12, x14, #1                  // Z set only when x14 == 1 (first pass with ht == 16)
    swp       v4.8b, v8.8b                  // v4 <- r10 lo, v8 <- r12 lo
    swp       v5.8b, v9.8b                  // v5 <- r10 hi, v9 <- r12 hi
    sqrshrun  v31.8b, v20.8h, #5            // narrow row7 hi
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 7
    bne       end_func                      // done unless this was pass 1 of a 16-row block
    add       x14, x14, #1                  // makes the test above fail after the second pass
    ld1       {v10.2s, v11.2s}, [x0], x2    // r13, i.e. r5 of the next pass
    uaddl     v12.8h, v4.8b, v6.8b          // r2 + r3 (lo) of the next pass
    b         loop_16

    //--------------------------------------------------------------------------
    // 8-wide path: one D register per row.
    //--------------------------------------------------------------------------
loop_8_start:
    ld1       {v0.2s}, [x0], x2             // r0
    ld1       {v1.2s}, [x0], x2             // r1
    ld1       {v2.2s}, [x0], x2             // r2
    ld1       {v3.2s}, [x0], x2             // r3
    add       x14, x14, #1                  // x14 = ht - 15
    ld1       {v4.2s}, [x0], x2             // r4
    ld1       {v5.2s}, [x0], x2             // r5

loop_8:                                     // one pass = up to 8 output rows
    uaddl     v6.8h, v2.8b, v3.8b           // r2 + r3
    uaddl     v8.8h, v0.8b, v5.8b           // row0  = r0 + r5
    uaddl     v10.8h, v1.8b, v4.8b          // r1 + r4
    mla       v8.8h, v6.8h, v22.8h          // row0 += 20 * (r2 + r3)
    ld1       {v6.2s}, [x0], x2             // r6
    uaddl     v14.8h, v3.8b, v4.8b          // r3 + r4
    uaddl     v16.8h, v1.8b, v6.8b          // row1  = r1 + r6
    uaddl     v18.8h, v2.8b, v5.8b          // r2 + r5
    mls       v8.8h, v10.8h, v24.8h         // row0 -= 5 * (r1 + r4)
    mla       v16.8h, v14.8h, v22.8h        // row1 += 20 * (r3 + r4)
    ld1       {v7.2s}, [x0], x2             // r7
    uaddl     v20.8h, v4.8b, v5.8b          // r4 + r5
    uaddl     v12.8h, v2.8b, v7.8b          // row2  = r2 + r7
    uaddl     v10.8h, v3.8b, v6.8b          // r3 + r6
    mls       v16.8h, v18.8h, v24.8h        // row1 -= 5 * (r2 + r5)
    sqrshrun  v26.8b, v8.8h, #5             // narrow row0
    mla       v12.8h, v20.8h, v22.8h        // row2 += 20 * (r4 + r5)
    ld1       {v0.2s}, [x0], x2             // r8
    uaddl     v14.8h, v5.8b, v6.8b          // r5 + r6
    sqrshrun  v27.8b, v16.8h, #5            // narrow row1
    uaddl     v20.8h, v3.8b, v0.8b          // row3  = r3 + r8
    mls       v12.8h, v10.8h, v24.8h        // row2 -= 5 * (r3 + r6)
    st1       {v26.2s}, [x1], x3            // store row 0
    uaddl     v18.8h, v4.8b, v7.8b          // r4 + r7
    mla       v20.8h, v14.8h, v22.8h        // row3 += 20 * (r5 + r6)
    st1       {v27.2s}, [x1], x3            // store row 1
    sqrshrun  v28.8b, v12.8h, #5            // narrow row2
    st1       {v28.2s}, [x1], x3            // store row 2
    mls       v20.8h, v18.8h, v24.8h        // row3 -= 5 * (r4 + r7)
    ld1       {v1.2s}, [x0], x2             // r9
    sqrshrun  v29.8b, v20.8h, #5            // narrow row3
    subs      x9, x4, #4                    // ht == 4 ?
    st1       {v29.2s}, [x1], x3            // store row 3
    beq       end_func                      // only 4 rows requested

    uaddl     v14.8h, v6.8b, v7.8b          // r6 + r7
    uaddl     v16.8h, v0.8b, v5.8b          // r5 + r8
    uaddl     v18.8h, v1.8b, v4.8b          // row4  = r4 + r9
    mla       v18.8h, v14.8h, v22.8h        // row4 += 20 * (r6 + r7)
    ld1       {v2.2s}, [x0], x2             // r10
    mls       v18.8h, v16.8h, v24.8h        // row4 -= 5 * (r5 + r8)
    uaddl     v8.8h, v0.8b, v7.8b           // r7 + r8
    uaddl     v10.8h, v1.8b, v6.8b          // r6 + r9
    uaddl     v12.8h, v2.8b, v5.8b          // row5  = r5 + r10
    sqrshrun  v26.8b, v18.8h, #5            // narrow row4
    mla       v12.8h, v8.8h, v22.8h         // row5 += 20 * (r7 + r8)
    ld1       {v3.2s}, [x0], x2             // r11
    mls       v12.8h, v10.8h, v24.8h        // row5 -= 5 * (r6 + r9)
    st1       {v26.2s}, [x1], x3            // store row 4
    sqrshrun  v27.8b, v12.8h, #5            // narrow row5
    st1       {v27.2s}, [x1], x3            // store row 5
    uaddl     v14.8h, v0.8b, v1.8b          // r8 + r9
    uaddl     v16.8h, v2.8b, v7.8b          // r7 + r10
    uaddl     v18.8h, v3.8b, v6.8b          // row6  = r6 + r11
    mla       v18.8h, v14.8h, v22.8h        // row6 += 20 * (r8 + r9)
    ld1       {v4.2s}, [x0], x2             // r12
    mls       v18.8h, v16.8h, v24.8h        // row6 -= 5 * (r7 + r10)
    uaddl     v8.8h, v2.8b, v1.8b           // r9 + r10
    uaddl     v10.8h, v3.8b, v0.8b          // r8 + r11
    uaddl     v12.8h, v4.8b, v7.8b          // row7  = r7 + r12
    sqrshrun  v26.8b, v18.8h, #5            // narrow row6
    mla       v12.8h, v8.8h, v22.8h         // row7 += 20 * (r9 + r10)
    ld1       {v5.2s}, [x0], x2             // r13; v0..v5 now hold r8..r13 for a second pass
    mls       v12.8h, v10.8h, v24.8h        // row7 -= 5 * (r8 + r11)
    st1       {v26.2s}, [x1], x3            // store row 6
    sqrshrun  v27.8b, v12.8h, #5            // narrow row7
    subs      x12, x14, #1                  // Z set only when x14 == 1 (first pass with ht == 16)
    st1       {v27.2s}, [x1], x3            // store row 7
    add       x14, x14, #1                  // plain add: flags from the subs above survive
    beq       loop_8                        // second pass for a 16-row block
    b         end_func

    //--------------------------------------------------------------------------
    // 4-wide path: only lane 0 (4 bytes) of each D register is loaded/stored;
    // the arithmetic still runs on all 8 lanes, the upper results are unused.
    //--------------------------------------------------------------------------
loop_4_start:
    ld1       {v0.s}[0], [x0], x2           // r0
    ld1       {v1.s}[0], [x0], x2           // r1
    ld1       {v2.s}[0], [x0], x2           // r2
    ld1       {v3.s}[0], [x0], x2           // r3
    ld1       {v4.s}[0], [x0], x2           // r4
    ld1       {v5.s}[0], [x0], x2           // r5
    uaddl     v6.8h, v2.8b, v3.8b           // r2 + r3
    uaddl     v8.8h, v0.8b, v5.8b           // row0  = r0 + r5
    uaddl     v10.8h, v1.8b, v4.8b          // r1 + r4
    mla       v8.8h, v6.8h, v22.8h          // row0 += 20 * (r2 + r3)
    ld1       {v6.s}[0], [x0], x2           // r6 - 4-byte lane load like every other row here
                                            // (was an 8-byte load that read past the 4-wide row)
    uaddl     v14.8h, v3.8b, v4.8b          // r3 + r4
    uaddl     v16.8h, v1.8b, v6.8b          // row1  = r1 + r6
    uaddl     v18.8h, v2.8b, v5.8b          // r2 + r5
    mls       v8.8h, v10.8h, v24.8h         // row0 -= 5 * (r1 + r4)
    ld1       {v7.s}[0], [x0], x2           // r7
    mla       v16.8h, v14.8h, v22.8h        // row1 += 20 * (r3 + r4)
    uaddl     v20.8h, v4.8b, v5.8b          // r4 + r5
    uaddl     v12.8h, v2.8b, v7.8b          // row2  = r2 + r7
    uaddl     v10.8h, v3.8b, v6.8b          // r3 + r6
    mls       v16.8h, v18.8h, v24.8h        // row1 -= 5 * (r2 + r5)
    sqrshrun  v26.8b, v8.8h, #5             // narrow row0
    mla       v12.8h, v20.8h, v22.8h        // row2 += 20 * (r4 + r5)
    ld1       {v0.s}[0], [x0], x2           // r8
    uaddl     v14.8h, v5.8b, v6.8b          // r5 + r6
    sqrshrun  v27.8b, v16.8h, #5            // narrow row1
    uaddl     v20.8h, v3.8b, v0.8b          // row3  = r3 + r8
    mls       v12.8h, v10.8h, v24.8h        // row2 -= 5 * (r3 + r6)
    st1       {v26.s}[0], [x1], x3          // store row 0
    uaddl     v18.8h, v4.8b, v7.8b          // r4 + r7
    mla       v20.8h, v14.8h, v22.8h        // row3 += 20 * (r5 + r6)
    st1       {v27.s}[0], [x1], x3          // store row 1
    sqrshrun  v28.8b, v12.8h, #5            // narrow row2
    st1       {v28.s}[0], [x1], x3          // store row 2
    mls       v20.8h, v18.8h, v24.8h        // row3 -= 5 * (r4 + r7)
    ld1       {v1.s}[0], [x0], x2           // r9
    sqrshrun  v29.8b, v20.8h, #5            // narrow row3
    st1       {v29.s}[0], [x1], x3          // store row 3
    subs      x9, x4, #4                    // ht == 4 ?
    beq       end_func                      // only 4 rows requested

    uaddl     v14.8h, v6.8b, v7.8b          // r6 + r7
    uaddl     v16.8h, v0.8b, v5.8b          // r5 + r8
    uaddl     v18.8h, v1.8b, v4.8b          // row4  = r4 + r9
    mla       v18.8h, v14.8h, v22.8h        // row4 += 20 * (r6 + r7)
    ld1       {v2.s}[0], [x0], x2           // r10
    mls       v18.8h, v16.8h, v24.8h        // row4 -= 5 * (r5 + r8)
    uaddl     v8.8h, v0.8b, v7.8b           // r7 + r8
    uaddl     v10.8h, v1.8b, v6.8b          // r6 + r9
    uaddl     v12.8h, v2.8b, v5.8b          // row5  = r5 + r10
    sqrshrun  v26.8b, v18.8h, #5            // narrow row4
    mla       v12.8h, v8.8h, v22.8h         // row5 += 20 * (r7 + r8)
    ld1       {v3.s}[0], [x0], x2           // r11
    mls       v12.8h, v10.8h, v24.8h        // row5 -= 5 * (r6 + r9)
    st1       {v26.s}[0], [x1], x3          // store row 4
    sqrshrun  v27.8b, v12.8h, #5            // narrow row5
    st1       {v27.s}[0], [x1], x3          // store row 5
    uaddl     v14.8h, v0.8b, v1.8b          // r8 + r9
    uaddl     v16.8h, v2.8b, v7.8b          // r7 + r10
    uaddl     v18.8h, v3.8b, v6.8b          // row6  = r6 + r11
    mla       v18.8h, v14.8h, v22.8h        // row6 += 20 * (r8 + r9)
    ld1       {v4.s}[0], [x0], x2           // r12
    mls       v18.8h, v16.8h, v24.8h        // row6 -= 5 * (r7 + r10)
    uaddl     v8.8h, v2.8b, v1.8b           // r9 + r10
    uaddl     v10.8h, v3.8b, v0.8b          // r8 + r11
    uaddl     v12.8h, v4.8b, v7.8b          // row7  = r7 + r12
    sqrshrun  v26.8b, v18.8h, #5            // narrow row6
    mla       v12.8h, v8.8h, v22.8h         // row7 += 20 * (r9 + r10)
    ld1       {v5.s}[0], [x0], x2           // r13 (loaded but not consumed on this path)
    mls       v12.8h, v10.8h, v24.8h        // row7 -= 5 * (r8 + r11)
    st1       {v26.s}[0], [x1], x3          // store row 6
    sqrshrun  v27.8b, v12.8h, #5            // narrow row7
    st1       {v27.s}[0], [x1], x3          // store row 7; falls through to the epilogue

end_func:
    ldp       x19, x20, [sp], #16           // undo the stp from the prologue
    pop_v_regs
    ret