blob: 7ac685545248c767c95bfb9fd82c5528e39c17d7 [file] [log] [blame]
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//* ihevc_inter_pred_chroma_copy.s
//*
//* @brief
//* Contains function definitions for inter prediction interpolation.
//* Functions are coded using NEON intrinsics and can be compiled using ARM
//* RVCT
//*
//* @author
//* Yogeswaran RS
//*
//* @par List of Functions:
//*
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//* Chroma interprediction filter for copy
//*
//* @par Description:
//* Copies the array of width 'wd' and height 'ht' from the location pointed
//* by 'src' to the location pointed by 'dst'
//*
//* @param[in] pu1_src
//* UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//* UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//* integer source stride
//*
//* @param[in] dst_strd
//* integer destination stride
//*
//* @param[in] pi1_coeff
//* WORD8 pointer to the filter coefficients
//*
//* @param[in] ht
//* integer height of the array
//*
//* @param[in] wd
//* integer width of the array
//*
//* @returns
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_copy( UWORD8 *pu1_src,
// UWORD8 *pu1_dst,
// WORD32 src_strd,
// WORD32 dst_strd,
// WORD8 *pi1_coeff,
// WORD32 ht,
// WORD32 wd)
//**************Variables Vs Registers*****************************************
//x0 => *pu1_src
//x1 => *pu1_dst
//x2 => src_strd
//x3 => dst_strd
//x4 => *pi1_coeff
//x5 => ht
//x6 => wd
.text
.align 4
.globl ihevc_inter_pred_chroma_copy_av8
.type ihevc_inter_pred_chroma_copy_av8, %function
//-----------------------------------------------------------------------------
// AAPCS64 entry: x0 = pu1_src, x1 = pu1_dst, x2 = src_strd, x3 = dst_strd,
//                x4 = pi1_coeff (unused - this kernel is a pure copy),
//                x5 = ht, x6 = wd (chroma width in sample pairs).
// Uses only caller-saved registers (x0-x12, v0-v3), so no stack frame or
// callee-save spills are required.
// Strategy: row byte-width = wd << 1. Dispatch to a 16-, 8- or 4-byte column
// path; each main loop copies 4 rows per column per iteration, with a
// 2-row tail path (ht & 3) for each width.
//-----------------------------------------------------------------------------
ihevc_inter_pred_chroma_copy_av8:
LSL x12,x6,#1 //x12 = wd << 1: bytes per row (2 interleaved chroma planes)
CMP x5,#0 //checks ht == 0
BLE END_LOOPS //nothing to copy for ht <= 0
AND x8,x5,#3 //x8 = ht & 3: rows left over after the 4-row unrolled loops
//NOTE(review): only a leftover of 2 rows is handled by the *_HT_2 tail
//paths below; presumably callers guarantee ht is even - confirm.
SUB x5,x5,x8 //x5 = ht rounded down to a multiple of 4
TST x12,#15 //checks wd for multiples for 16
BEQ CORE_LOOP_WD_16 //row width multiple of 16 bytes -> 16-byte vector copies
TST x12,#7 //checks wd for multiples for 4 & 8
BEQ CORE_LOOP_WD_8 //row width multiple of 8 (not 16) -> 8-byte copies
// ---- 4-byte column path (wd << 1 is a multiple of 4 only) ----
SUB x11,x12,#4 //x11 = (wd<<1) - 4: pointer pull-back after the last column
CMP x5,#0
BEQ OUTER_LOOP_WD_4_HT_2 //ht < 4: only the 2-row tail remains
OUTER_LOOP_WD_4:
SUBS x4,x12,#0 //x4 = bytes remaining in this 4-row strip; checks wd == 0
BLE END_INNER_LOOP_WD_4
INNER_LOOP_WD_4:
//Copy one 4-byte column of 4 rows: row 0 via x0/x1 (advanced by #4),
//rows 1-3 via x7/x6 (advanced by the strides).
LD1 {v0.s}[0],[x0] //row 0: vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
ADD x7,x0,x2 //x7 = pu1_src + src_strd (row 1 source)
ADD x6,x1,x3 //x6 = pu1_dst + dst_strd (row 1 destination)
ST1 {v0.s}[0],[x1] //row 0: vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
LD1 {v0.s}[0],[x7],x2 //row 1 load; x7 advances to row 2
ADD x0,x0,#4 //pu1_src += 4 (next column)
ST1 {v0.s}[0],[x6],x3 //row 1 store; x6 advances to row 2
LD1 {v0.s}[0],[x7],x2 //row 2 load; x7 advances to row 3
SUBS x4,x4,#4 //(wd - 4): one more 4-byte column done
ST1 {v0.s}[0],[x6],x3 //row 2 store; x6 advances to row 3
LD1 {v0.s}[0],[x7],x2 //row 3 load; x7 ends at column_base + 4*src_strd
ADD x1,x1,#4 //pu1_dst += 4 (next column)
ST1 {v0.s}[0],[x6],x3 //row 3 store; x6 ends at column_base + 4*dst_strd
BGT INNER_LOOP_WD_4 //more columns in this strip
END_INNER_LOOP_WD_4:
SUBS x5,x5,#4 //ht -= 4
SUB x0,x7,x11 //x0 = row_base + 4*src_strd: start of next 4-row strip
SUB x1,x6,x11 //x1 = row_base + 4*dst_strd
BGT OUTER_LOOP_WD_4
CMP x8,#0 //any leftover rows (ht & 3)?
BGT OUTER_LOOP_WD_4_HT_2
END_LOOPS:
RET
// ---- 4-byte column path, 2-row tail ----
OUTER_LOOP_WD_4_HT_2:
SUBS x4,x12,#0 //x4 = bytes remaining in the row; checks wd == 0
BLE END_LOOPS
INNER_LOOP_WD_4_HT_2:
//Copy one 4-byte column of 2 rows (row 0 via x0/x1, row 1 via x7/x6).
LD1 {v0.s}[0],[x0] //row 0 load
ADD x7,x0,x2 //x7 = pu1_src + src_strd (row 1 source)
ADD x6,x1,x3 //x6 = pu1_dst + dst_strd (row 1 destination)
ST1 {v0.s}[0],[x1] //row 0 store
LD1 {v0.s}[0],[x7],x2 //row 1 load
ADD x0,x0,#4 //pu1_src += 4 (next column)
ST1 {v0.s}[0],[x6],x3 //row 1 store
SUBS x4,x4,#4 //(wd - 4): one more column done
ADD x1,x1,#4 //pu1_dst += 4 (next column)
BGT INNER_LOOP_WD_4_HT_2
B END_LOOPS
// ---- 8-byte column path (wd << 1 is a multiple of 8, not 16) ----
CORE_LOOP_WD_8:
SUB x11,x12,#8 //x11 = (wd<<1) - 8: pointer pull-back after the last column
CMP x5,#0
BEQ OUTER_LOOP_WD_8_HT_2 //ht < 4: only the 2-row tail remains
OUTER_LOOP_WD_8:
SUBS x4,x12,#0 //x4 = bytes remaining in this 4-row strip; checks wd
BLE END_INNER_LOOP_WD_8
INNER_LOOP_WD_8:
//Copy one 8-byte column of 4 rows; v0-v3 hold one row each so loads
//and stores interleave without dependencies.
ADD x7,x0,x2 //x7 = pu1_src + src_strd (row 1 source)
LD1 {v0.8b},[x0],#8 //row 0 load; pu1_src += 8
ADD x6,x1,x3 //x6 = pu1_dst + dst_strd (row 1 destination)
ST1 {v0.8b},[x1],#8 //row 0 store; pu1_dst += 8
LD1 {v1.8b},[x7],x2 //row 1 load; x7 advances to row 2
ST1 {v1.8b},[x6],x3 //row 1 store; x6 advances to row 2
SUBS x4,x4,#8 //wd - 8 (loop condition)
LD1 {v2.8b},[x7],x2 //row 2 load
ST1 {v2.8b},[x6],x3 //row 2 store
LD1 {v3.8b},[x7],x2 //row 3 load; x7 ends at column_base + 4*src_strd
ST1 {v3.8b},[x6],x3 //row 3 store; x6 ends at column_base + 4*dst_strd
BGT INNER_LOOP_WD_8 //more columns in this strip
END_INNER_LOOP_WD_8:
SUBS x5,x5,#4 //ht -= 4
SUB x0,x7,x11 //x0 = row_base + 4*src_strd: start of next 4-row strip
SUB x1,x6,x11 //x1 = row_base + 4*dst_strd
BGT OUTER_LOOP_WD_8
CMP x8,#0 //any leftover rows (ht & 3)?
BGT OUTER_LOOP_WD_8_HT_2
B END_LOOPS
// ---- 8-byte column path, 2-row tail ----
OUTER_LOOP_WD_8_HT_2:
SUBS x4,x12,#0 //checks wd
BLE END_LOOPS
INNER_LOOP_WD_8_HT_2:
//NOTE(review): no loop-back branch here - this tail copies exactly one
//8-byte column of 2 rows; presumably wd<<1 == 8 whenever this path is
//taken (wider multiples of 8 would be left partially copied) - confirm.
ADD x7,x0,x2 //x7 = pu1_src + src_strd (row 1 source)
LD1 {v0.8b},[x0],#8 //row 0 load
ADD x6,x1,x3 //x6 = pu1_dst + dst_strd (row 1 destination)
ST1 {v0.8b},[x1],#8 //row 0 store
LD1 {v1.8b},[x7],x2 //row 1 load
ST1 {v1.8b},[x6],x3 //row 1 store
B END_LOOPS
// ---- 16-byte column path (wd << 1 is a multiple of 16) ----
CORE_LOOP_WD_16:
SUB x11,x12,#16 //x11 = (wd<<1) - 16: pointer pull-back after the last column
CMP x5,#0
BEQ OUTER_LOOP_WD_16_HT_2 //ht < 4: only the 2-row tail remains
OUTER_LOOP_WD_16:
SUBS x4,x12,#0 //x4 = bytes remaining in this 4-row strip; checks wd
BLE END_INNER_LOOP_WD_16
INNER_LOOP_WD_16:
//Copy one 16-byte column of 4 rows; same schedule as the 8-byte path
//with full 128-bit vectors.
ADD x7,x0,x2 //x7 = pu1_src + src_strd (row 1 source)
LD1 {v0.16b},[x0],#16 //row 0 load; pu1_src += 16
ADD x6,x1,x3 //x6 = pu1_dst + dst_strd (row 1 destination)
ST1 {v0.16b},[x1],#16 //row 0 store; pu1_dst += 16
LD1 {v1.16b},[x7],x2 //row 1 load; x7 advances to row 2
ST1 {v1.16b},[x6],x3 //row 1 store; x6 advances to row 2
SUBS x4,x4,#16 //wd - 16 (loop condition)
LD1 {v2.16b},[x7],x2 //row 2 load
ST1 {v2.16b},[x6],x3 //row 2 store
LD1 {v3.16b},[x7],x2 //row 3 load; x7 ends at column_base + 4*src_strd
ST1 {v3.16b},[x6],x3 //row 3 store; x6 ends at column_base + 4*dst_strd
BGT INNER_LOOP_WD_16 //more columns in this strip
END_INNER_LOOP_WD_16:
SUBS x5,x5,#4 //ht -= 4
SUB x0,x7,x11 //x0 = row_base + 4*src_strd: start of next 4-row strip
SUB x1,x6,x11 //x1 = row_base + 4*dst_strd
BGT OUTER_LOOP_WD_16
CMP x8,#0 //any leftover rows (ht & 3)?
BGT OUTER_LOOP_WD_16_HT_2
B END_LOOPS
// ---- 16-byte column path, 2-row tail ----
OUTER_LOOP_WD_16_HT_2:
SUBS x4,x12,#0 //checks wd
BLE END_LOOPS
INNER_LOOP_WD_16_HT_2:
//NOTE(review): no loop-back branch here - this tail copies exactly one
//16-byte column of 2 rows; presumably wd<<1 == 16 whenever this path is
//taken - confirm against callers.
ADD x7,x0,x2 //x7 = pu1_src + src_strd (row 1 source)
LD1 {v0.16b},[x0],#16 //row 0 load
ADD x6,x1,x3 //x6 = pu1_dst + dst_strd (row 1 destination)
ST1 {v0.16b},[x1],#16 //row 0 store
LD1 {v1.16b},[x7],x2 //row 1 load
ST1 {v1.16b},[x6],x3 //row 1 store
RET