blob: 4cc60854eb6342cce059b67fd7a989495b87eb08 [file] [log] [blame]
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//* ihevcd_fmt_conv_420sp_to_420p.s
//*
//* //brief
//* contains function definitions for format conversions
//*
//* //author
//* ittiam
//*
//* //par list of functions:
//* - ihevcd_fmt_conv_420sp_to_420p_av8()
//*
//* //remarks
//* none
//*
//*******************************************************************************/
.text
.include "ihevc_neon_macros.s"
///*****************************************************************************
//*
//* Function Name    : ihevcd_fmt_conv_420sp_to_420p_av8()
//*
//* Description      : Converts an image from YUV 4:2:0 semi-planar (Y plane
//*                    plus one interleaved chroma plane) to YUV 4:2:0 planar
//*                    (separate Y, U and V planes).
//*                    - Luma rows are copied as-is unless disable_luma_copy
//*                      is non-zero.
//*                    - Each source chroma row is de-interleaved: even bytes
//*                      go to pu1_dest_u and odd bytes to pu1_dest_v when
//*                      is_u_first is non-zero; the two destinations are
//*                      swapped when is_u_first is zero.
//*
//* ABI              : AAPCS64. The integer arguments are treated as 32-bit
//*                    signed values and are sign-extended here, because the
//*                    upper 32 bits of w5-w7 are unspecified on entry.
//*
//* Arguments        : x0        pu1_src_y
//*                    x1        pu1_src_uv
//*                    x2        pu1_dest_y
//*                    x3        pu1_dest_u
//*                    x4        pu1_dest_v
//*                    w5        u2_width
//*                    w6        u2_height
//*                    w7        u2_stridey
//*                    [sp,#0]   u2_strideuv
//*                    [sp,#8]   u2_dest_stridey
//*                    [sp,#16]  u2_dest_strideuv
//*                    [sp,#24]  is_u_first
//*                    [sp,#32]  disable_luma_copy
//*
//* Values Returned  : None
//*
//* Register Usage   : x0 - x12, x15, v0, v1 (all caller-saved)
//*
//* Stack Usage      : None (leaf function, no frame is created)
//*
//* Interruptibility : Interruptible
//*
//* Known Limitations
//* Assumptions      : Image Width : multiple of 2 and at least 16. Every row
//*                    is moved in 16-byte chunks and a partial last chunk is
//*                    handled by stepping back inside the row, so a narrower
//*                    row would be accessed before its first byte.
//*                    Image Height: assumed to be even. A height that yields
//*                    zero rows copies nothing.
//*
//* Revision History :
//* DD MM YYYY Author(s) Changes (Describe the changes made)
//* 16 05 2012 Naveen SR draft
//*
//*****************************************************************************/
.globl ihevcd_fmt_conv_420sp_to_420p_av8
.type ihevcd_fmt_conv_420sp_to_420p_av8, %function
ihevcd_fmt_conv_420sp_to_420p_av8:
    // Only caller-saved registers are used, so nothing is pushed and the
    // stack arguments stay at their entry offsets from sp.
    sxtw    x8, w5                      // x8  = width
    sxtw    x9, w6                      // x9  = height
    sxtw    x7, w7                      // x7  = source Y stride
    mov     x15, x4                     // x15 = pu1_dest_v (x4 becomes the row counter)
    ldrsw   x5, [sp, #8]                // x5  = u2_dest_stridey
    ldr     w12, [sp, #32]              // w12 = disable_luma_copy
    sub     x10, x7, x8                 // source Y: bytes from end of row to next row
    sub     x11, x5, x8                 // dest Y:   bytes from end of row to next row
    cbnz    w12, uv_copy_start          // skip luma if disable_luma_copy is non-zero
    cmp     x9, #0
    ble     uv_copy_start               // no luma rows to copy

    ///* Copy Y */
    mov     x4, x9                      // x4 = rows remaining
y_row_loop:
    mov     x6, x8                      // x6 = bytes remaining in this row
y_col_loop:
    sub     x6, x6, #16
    ld1     {v0.8b, v1.8b}, [x0], #16
    st1     {v0.8b, v1.8b}, [x2], #16
    cmp     x6, #16
    bge     y_col_loop
    cbz     x6, y_col_loop_end          // width was a multiple of 16
    // Partial last chunk: step both pointers back by (16 - remaining) bytes
    // and copy a full 16 bytes that overlap the previous chunk.
    // Ex: for width 162 the loop above handles 160 bytes, then bytes
    // 146..161 are copied here.
    mov     x12, #16
    sub     x12, x12, x6                // x12 = 16 - remaining
    sub     x0, x0, x12
    sub     x2, x2, x12
    ld1     {v0.8b, v1.8b}, [x0], #16
    st1     {v0.8b, v1.8b}, [x2], #16
y_col_loop_end:
    add     x0, x0, x10                 // advance source to the next row
    add     x2, x2, x11                 // advance destination to the next row
    subs    x4, x4, #1
    bgt     y_row_loop

    ///* Copy UV */
uv_copy_start:
    ldrsw   x7, [sp]                    // x7 = u2_strideuv
    ldrsw   x5, [sp, #16]               // x5 = u2_dest_strideuv
    ldr     w4, [sp, #24]               // w4 = is_u_first
    asr     x9, x9, #1                  // x9 = chroma rows = height / 2
    cmp     x9, #0
    ble     exit                        // no chroma rows to copy
    sub     x10, x7, x8                 // source UV: a row holds `width` interleaved bytes
    lsr     x11, x8, #1
    sub     x11, x5, x11                // dest U/V: a row holds width / 2 bytes
    // x3 receives the even source bytes and x5 the odd ones. Swap the two
    // destination planes when is_u_first is zero.
    cmp     w4, #0
    csel    x5, x3, x15, eq             // odd bytes : is_u_first ? dest_v : dest_u
    csel    x3, x15, x3, eq             // even bytes: is_u_first ? dest_u : dest_v
    mov     x4, x9                      // x4 = rows remaining
uv_row_loop:
    mov     x6, x8                      // x6 = interleaved bytes remaining in this row
uv_col_loop:
    sub     x6, x6, #16
    prfm    PLDL1KEEP, [x1, #128]
    ld2     {v0.8b, v1.8b}, [x1], #16   // v0 = even bytes, v1 = odd bytes
    st1     {v0.8b}, [x3], #8
    st1     {v1.8b}, [x5], #8
    cmp     x6, #16
    bge     uv_col_loop
    cbz     x6, uv_col_loop_end         // width was a multiple of 16
    // Partial last chunk: step the source back by (16 - remaining) bytes and
    // each destination by half of that, then redo a full 16-byte chunk.
    mov     x12, #16
    sub     x12, x12, x6                // x12 = 16 - remaining
    sub     x1, x1, x12
    sub     x3, x3, x12, lsr #1
    sub     x5, x5, x12, lsr #1
    ld2     {v0.8b, v1.8b}, [x1], #16
    st1     {v0.8b}, [x3], #8
    st1     {v1.8b}, [x5], #8
uv_col_loop_end:
    add     x1, x1, x10                 // advance source to the next row
    add     x3, x3, x11                 // advance both destinations to the next row
    add     x5, x5, x11
    subs    x4, x4, #1
    bgt     uv_row_loop
exit:
    ret