blob: ccf47a573bb42e79f7c464ca49ebea4a5115cf8a [file] [log] [blame]
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//* ihevcd_fmt_conv_420sp_to_420sp.s
//*
//* //brief
//* contains function definitions for format conversions
//*
//* //author
//* ittiam
//*
//* //par list of functions:
//*
//*
//* //remarks
//* none
//*
//*******************************************************************************/
// Assembly-time constant; it is not referenced anywhere in this file.
// NOTE(review): possibly consumed by the included macro file or left over
// from another kernel -- confirm before removing.
.equ DO1STROUNDING, 0
// ARM
//
// PRESERVE8
// (The three comment lines above look like leftovers of armasm directives
// from a 32-bit ARM version of this file; they have no effect here.)
// Code goes into the .text section, aligned to 2^2 = 4 bytes.
.text
.p2align 2
// push_v_regs / pop_v_regs used by the routine below are not defined in this
// file; presumably they come from this include -- TODO confirm.
.include "ihevc_neon_macros.s"
///*****************************************************************************
//* *
//* Function Name : ihevcd_fmt_conv_420sp_to_420sp() *
//* *
//* Description : This function converts the image from YUV420SP color *
//* space to 420SP color space(UV interleaved). *
//* *
//* Arguments : x0 pu1_y *
//* x1 pu1_uv *
//* x2 pu1_dest_y *
//* x3 pu1_dest_uv *
//* x4 u2_width *
//* x5 u2_height *
//* x6 u2_stridey *
//* x7 u2_stridechroma *
//* stack (1st stack argument) u2_dest_stridey *
//* stack (2nd stack argument) u2_dest_stridechroma *
//* *
//* Values Returned : None *
//* *
//* Register Usage : x0 - x14 *
//* *
//* Stack Usage : 40 Bytes *
//* *
//* Interruptibility : Interruptible *
//* *
//* Known Limitations *
//* Assumptions: Image Width: Assumed to be multiple of 2 and *
//* Image Height: Assumed to be even. *
//* *
//* Revision History : *
//* DD MM YYYY Author(s) Changes (Describe the changes made) *
//* 16 05 2012 Naveen SR draft *
//* *
//*****************************************************************************/
.global ihevcd_fmt_conv_420sp_to_420sp_av8
// The .type directive must name the symbol that is actually defined below.
.type ihevcd_fmt_conv_420sp_to_420sp_av8, %function

//------------------------------------------------------------------------------
// ihevcd_fmt_conv_420sp_to_420sp_av8
//
// Copies a 420SP picture (a Y plane followed by an interleaved UV plane) from
// source buffers to destination buffers that may use different row strides.
//
// In:   x0 = source Y pointer          x1 = source UV pointer
//       x2 = destination Y pointer     x3 = destination UV pointer
//       w4 = width                     w5 = height
//       w6 = source Y stride           w7 = source UV stride
//       1st stack argument = destination Y stride
//       2nd stack argument = destination UV stride
// Out:  none
//
// Working registers: x4-x11, x20, v0-v3, flags.
//
// Preconditions visible from the code:
//   * Each Y row is copied in 32-byte groups and the first group is copied
//     unconditionally, so width must be >= 32 (>= 16 would suffice for the
//     UV pass, which uses 16-byte groups).
//   * Both row loops run their body before testing the counter, so height
//     must be >= 2 (the UV pass copies height/2 rows).
//   * A partial last group is handled by stepping both pointers back and
//     re-copying a full group that ends exactly at the row end; the bytes
//     that overlap the previous group are simply copied twice.
//------------------------------------------------------------------------------
ihevcd_fmt_conv_420sp_to_420sp_av8:
    push_v_regs
    stp     x19, x20, [sp, #-16]!

    // The stack-passed strides are read as signed 32-bit values, so the
    // register-passed arguments are treated the same way: AAPCS64 leaves the
    // upper 32 bits of a 32-bit argument register unspecified.
    sxtw    x8, w4                      // x8 = width
    sxtw    x9, w5                      // x9 = height
    sxtw    x6, w6                      // x6 = source Y stride
    sxtw    x7, w7                      // x7 = source UV stride (kept for the UV pass)

    // Offset 80 skips what push_v_regs and the stp above have pushed.
    // NOTE(review): this implies push_v_regs pushes 64 bytes -- confirm
    // against ihevc_neon_macros.s if that macro ever changes.
    ldrsw   x5, [sp, #80]               // x5 = destination Y stride

    sub     x10, x6, x8                 // x10 = source Y bytes to skip after each row
    sub     x11, x5, x8                 // x11 = destination Y bytes to skip after each row

    // ---- Copy Y ----
    mov     x4, x9                      // x4 = rows remaining
y_row_loop:
    mov     x6, x8                      // x6 = bytes remaining in this row
y_col_loop:
    prfm    PLDL1KEEP, [x0, #128]
    sub     x6, x6, #32
    ld1     {v0.8b}, [x0], #8
    ld1     {v1.8b}, [x0], #8
    ld1     {v2.8b}, [x0], #8
    ld1     {v3.8b}, [x0], #8
    st1     {v0.8b}, [x2], #8
    st1     {v1.8b}, [x2], #8
    st1     {v2.8b}, [x2], #8
    st1     {v3.8b}, [x2], #8
    cmp     x6, #32
    bge     y_col_loop
    cbz     x6, y_col_loop_end          // width was a multiple of 32: row done

    // Width is not a multiple of 32: step both pointers back by
    // (32 - remaining) so that one more full 32-byte copy ends exactly at the
    // end of the row. Example: width 162 -> the loop copied 160 bytes,
    // 2 remain, the pointers move back 30 bytes to offset 130 and bytes
    // 130..161 are copied.
    sub     x20, x6, #32
    neg     x6, x20                     // x6 = 32 - remaining
    sub     x0, x0, x6
    sub     x2, x2, x6
    ld1     {v0.8b}, [x0], #8
    ld1     {v1.8b}, [x0], #8
    ld1     {v2.8b}, [x0], #8
    ld1     {v3.8b}, [x0], #8
    st1     {v0.8b}, [x2], #8
    st1     {v1.8b}, [x2], #8
    st1     {v2.8b}, [x2], #8
    st1     {v3.8b}, [x2], #8
y_col_loop_end:
    add     x0, x0, x10                 // advance to the next source row
    add     x2, x2, x11                 // advance to the next destination row
    subs    x4, x4, #1
    bgt     y_row_loop

    // ---- Copy UV ----
    ldrsw   x5, [sp, #88]               // x5 = destination UV stride
    lsr     x9, x9, #1                  // UV plane has height/2 rows
    // The width is NOT halved: each UV row is copied as 'width' bytes.
    mov     x2, x3                      // x2 = destination UV pointer
    sub     x10, x7, x8                 // source UV skip, from the UV stride (x7)
    sub     x11, x5, x8                 // destination UV skip
    mov     x4, x9                      // x4 = rows remaining
uv_row_loop:
    mov     x6, x8                      // x6 = bytes remaining in this row
uv_col_loop:
    prfm    PLDL1KEEP, [x1, #128]
    sub     x6, x6, #16
    ld1     {v0.8b}, [x1], #8
    ld1     {v1.8b}, [x1], #8
    st1     {v0.8b}, [x2], #8
    st1     {v1.8b}, [x2], #8
    cmp     x6, #16
    bge     uv_col_loop
    cbz     x6, uv_col_loop_end         // width was a multiple of 16: row done

    // Width is not a multiple of 16: step both pointers back by
    // (16 - remaining) and copy one more full 16-byte group ending at the
    // row end. Example: width 162 -> 160 bytes copied, 2 remain, the
    // pointers move back 14 bytes to offset 146 and bytes 146..161 are copied.
    sub     x20, x6, #16
    neg     x6, x20                     // x6 = 16 - remaining
    sub     x1, x1, x6
    sub     x2, x2, x6
    ld1     {v0.8b}, [x1], #8
    ld1     {v1.8b}, [x1], #8
    st1     {v0.8b}, [x2], #8
    st1     {v1.8b}, [x2], #8
uv_col_loop_end:
    add     x1, x1, x10                 // advance to the next source row
    add     x2, x2, x11                 // advance to the next destination row
    subs    x4, x4, #1
    bgt     uv_row_loop

exit:
    ldp     x19, x20, [sp], #16
    pop_v_regs
    ret
// Empty .note.GNU-stack section with no flags: the GNU toolchain convention
// for declaring that this object does not need an executable stack.
.section .note.GNU-stack,"",%progbits