// blob: 007df30a4c5c80594ab73a3e09d5d201d745ead9 [file] [log] [blame]
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
///**
//*******************************************************************************
//*
//* @brief
//* Interprediction luma function for copy
//*
//* @par Description:
//* Copies the array of width 'wd' and height 'ht' from the location pointed
//* by 'src' to the location pointed by 'dst'
//*
//* @param[in] pu1_src
//* UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//* UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//* integer source stride
//*
//* @param[in] dst_strd
//* integer destination stride
//*
//*
//* @param[in] ht
//* integer height of the array
//*
//* @param[in] wd
//* integer width of the array
//*
//* @returns
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
//void ih264_inter_pred_luma_copy (
// UWORD8 *pu1_src,
// UWORD8 *pu1_dst,
// WORD32 src_strd,
// WORD32 dst_strd,
// WORD32 ht,
// WORD32 wd )
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
// w2 => src_strd
// w3 => dst_strd
// w4 => ht
// w5 => wd
// Everything below is emitted into the code section.
.text
// Align to 2^2 = 4 bytes.
.p2align 2
// NOTE(review): push_v_regs / pop_v_regs used by the routines below are not
// defined in this file, so they presumably come from this include - confirm.
.include "ih264_neon_macros.s"
.global ih264_inter_pred_luma_copy_av8

//------------------------------------------------------------------------------
// void ih264_inter_pred_luma_copy_av8(UWORD8 *pu1_src, UWORD8 *pu1_dst,
//                                     WORD32 src_strd, WORD32 dst_strd,
//                                     WORD32 ht, WORD32 wd)
//
// Copies a wd x ht block of bytes from pu1_src to pu1_dst. The block is walked
// four rows at a time; within those four rows it is copied in vertical strips
// of 16, 8 or 4 bytes depending on the largest of those that divides wd.
//
// In:   x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
//       w4 = ht, w5 = wd
// Out:  none
//
// Register roles inside the routine:
//   x0 / x1 : source / destination pointer for row 0 of the current strip
//   x2 / x3 : sign-extended src_strd / dst_strd
//   x4      : bytes of width still to copy in the current 4-row pass
//   x5 / x6 : source / destination pointer stepping through rows 1..3
//   x7      : rows still to copy
//   x11     : wd - strip width, used to rewind to column 0 of the next pass
//   x12     : wd
//   v0-v6   : data in flight
//
// Preconditions the code relies on but does not check:
//   * wd is a multiple of 4. The narrow path moves 4 bytes per step and the
//     rewind value (wd - 4) is only correct for such widths.
//   * Rows are always processed in groups of four, so a ht that is not a
//     multiple of 4 is effectively rounded up to the next multiple.
// Returns immediately, touching no memory, when ht <= 0 or wd <= 0.
//------------------------------------------------------------------------------
ih264_inter_pred_luma_copy_av8:
    // NOTE(review): push_v_regs/pop_v_regs are defined outside this file;
    // presumably they save/restore v8-v15. Only v0-v6 are written here, so the
    // pair looks removable - confirm against the macro definition.
    push_v_regs
    sxtw      x2, w2                    // strides are 32-bit ints: widen for address math
    sxtw      x3, w3
    sxtw      x7, w4                    // x7  = rows left (ht)
    sxtw      x12, w5                   // x12 = wd

    cmp       x7, #0
    ble       end_loops                 // ht <= 0: nothing to copy
    cmp       x12, #0
    ble       end_loops                 // wd <= 0: nothing to copy

    tst       x12, #15
    beq       core_loop_wd_16           // wd is a multiple of 16
    tst       x12, #7
    beq       core_loop_wd_8            // wd is a multiple of 8

    // ---- remaining widths: 4-byte strips ------------------------------------
    sub       x11, x12, #4              // rewind distance after the last strip
outer_loop_wd_4:
    mov       x4, x12                   // x4 = wd (> 0 here, so the strip loop runs at least once)
inner_loop_wd_4:
    ld1       {v0.s}[0], [x0]           // row 0: load 4 bytes
    add       x5, x0, x2                // x5 -> row 1 of this strip (src)
    add       x6, x1, x3                // x6 -> row 1 of this strip (dst)
    st1       {v0.s}[0], [x1]           // row 0: store
    ld1       {v0.s}[0], [x5], x2       // row 1
    add       x0, x0, #4                // next strip (src)
    st1       {v0.s}[0], [x6], x3
    ld1       {v0.s}[0], [x5], x2       // row 2
    subs      x4, x4, #4                // width left; flags consumed by bgt below
    st1       {v0.s}[0], [x6], x3
    ld1       {v0.s}[0], [x5], x2       // row 3
    add       x1, x1, #4                // next strip (dst)
    st1       {v0.s}[0], [x6], x3
    bgt       inner_loop_wd_4

    subs      x7, x7, #4                // four rows done
    sub       x0, x5, x11               // x5 is 4 rows below the last strip: rewind to column 0
    sub       x1, x6, x11
    bgt       outer_loop_wd_4
    // fall through into the shared exit

end_loops:
    pop_v_regs
    ret

    // ---- wd multiple of 8: 8-byte strips ------------------------------------
core_loop_wd_8:
    sub       x11, x12, #8              // rewind distance after the last strip
outer_loop_wd_8:
    mov       x4, x12                   // x4 = wd
inner_loop_wd_8:
    add       x5, x0, x2                // x5 -> row 1 of this strip (src)
    ld1       {v0.8b}, [x0], #8         // row 0, then advance to next strip
    add       x6, x1, x3                // x6 -> row 1 of this strip (dst)
    st1       {v0.8b}, [x1], #8
    ld1       {v1.8b}, [x5], x2         // row 1
    st1       {v1.8b}, [x6], x3
    subs      x4, x4, #8                // width left; flags consumed by bgt below
    ld1       {v2.8b}, [x5], x2         // row 2
    st1       {v2.8b}, [x6], x3
    ld1       {v3.8b}, [x5], x2         // row 3
    st1       {v3.8b}, [x6], x3
    bgt       inner_loop_wd_8

    subs      x7, x7, #4                // four rows done
    sub       x0, x5, x11               // rewind to column 0 of the next four rows
    sub       x1, x6, x11
    bgt       outer_loop_wd_8
    b         end_loops

    // ---- wd multiple of 16: 16-byte strips ----------------------------------
core_loop_wd_16:
    sub       x11, x12, #16             // rewind distance after the last strip
outer_loop_wd_16:
    mov       x4, x12                   // x4 = wd
inner_loop_wd_16:
    add       x5, x0, x2                // x5 -> row 1 of this strip (src)
    ld1       {v0.16b}, [x0], #16       // row 0, then advance to next strip
    add       x6, x1, x3                // x6 -> row 1 of this strip (dst)
    st1       {v0.16b}, [x1], #16
    ld1       {v2.16b}, [x5], x2        // row 1
    st1       {v2.16b}, [x6], x3
    subs      x4, x4, #16               // width left; flags consumed by bgt below
    ld1       {v4.16b}, [x5], x2        // row 2
    st1       {v4.16b}, [x6], x3
    ld1       {v6.16b}, [x5], x2        // row 3
    st1       {v6.16b}, [x6], x3
    bgt       inner_loop_wd_16

    subs      x7, x7, #4                // four rows done
    sub       x0, x5, x11               // rewind to column 0 of the next four rows
    sub       x1, x6, x11
    bgt       outer_loop_wd_16
    b         end_loops
// /*
// ********************************************************************************
// *
// * @brief This function copies a 4x4 block to destination
// *
// * @par Description:
// * Copies a 4x4 block to destination, where both src and dst are interleaved
// *
// * @param[in] pi2_src
// * Source
// *
// * @param[in] pu1_out
// * Output pointer
// *
// * @param[in] pred_strd,
// * Prediction buffer stride
// *
// * @param[in] out_strd
// * Output buffer stride
// *
// * @returns none
// *
// * @remarks none
// * Currently wd and ht are not used, i.e. a 4x4 block is always copied
// *
// *******************************************************************************
// */
// void ih264_interleave_copy(WORD16 *pi2_src,
// UWORD8 *pu1_out,
// WORD32 pred_strd,
// WORD32 out_strd,
// WORD32 wd,
// WORD32 ht)
// Register Usage
// x0 : pi2_src
// x1 : pu1_out
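// w2 : pred_strd (byte step between source rows)
// w3 : out_strd (byte step between output rows)
// Neon registers v2-v5, v18-v21 and v30 are used
// None of these are callee-saved under AAPCS64, so no ARM or Neon register needs to be saved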
.global ih264_interleave_copy_av8

//------------------------------------------------------------------------------
// ih264_interleave_copy_av8
//
// In:   x0 = pi2_src, x1 = pu1_out, w2 = source row step, w3 = output row step
// Out:  none
//
// For each of four rows, reads 8 bytes from the source and 8 bytes from the
// output buffer, replaces the low byte of every 16-bit lane of the output data
// with the matching source byte, and writes the 8 bytes back. The high byte of
// every 16-bit lane of the output is left as it was.
//
// Uses v2-v5 (source rows), v18-v21 (output rows) and v30 (lane mask).
//------------------------------------------------------------------------------
ih264_interleave_copy_av8:
    // NOTE(review): push_v_regs/pop_v_regs are defined outside this file;
    // presumably they save/restore v8-v15, none of which are touched here.
    push_v_regs
    movi      v30.8h, #0x00ff           // mask: select the low byte of each 16-bit lane
    sxtw      x3, w3                    // widen the 32-bit row steps for address math
    sxtw      x2, w2

    // Phase 1: fetch four source rows, 8 bytes each.
    ld1       {v2.8b}, [x0], x2         // source row 0
    ld1       {v3.8b}, [x0], x2         // source row 1
    ld1       {v4.8b}, [x0], x2         // source row 2
    ld1       {v5.8b}, [x0], x2         // source row 3

    // Phase 2: fetch the four current output rows. x0 is done with the source,
    // so it becomes the read-back pointer while x1 is kept for the stores.
    mov       x0, x1
    ld1       {v18.8b}, [x0], x3        // output row 0
    ld1       {v19.8b}, [x0], x3        // output row 1
    ld1       {v20.8b}, [x0], x3        // output row 2
    ld1       {v21.8b}, [x0], x3        // output row 3

    // Phase 3: pack row pairs into full 128-bit registers.
    mov       v2.d[1], v3.d[0]          // v2  = source rows 0|1
    mov       v4.d[1], v5.d[0]          // v4  = source rows 2|3
    mov       v18.d[1], v19.d[0]        // v18 = output rows 0|1
    mov       v20.d[1], v21.d[0]        // v20 = output rows 2|3

    // Phase 4: take source bits where the mask is set, keep output bits elsewhere.
    bit       v18.16b, v2.16b, v30.16b
    bit       v20.16b, v4.16b, v30.16b

    // Phase 5: write the four blended rows back.
    st1       {v18.8b}, [x1], x3        // row 0
    st1       {v18.d}[1], [x1], x3      // row 1
    st1       {v20.8b}, [x1], x3        // row 2
    st1       {v20.d}[1], [x1], x3      // row 3

    pop_v_regs
    ret