blob: ec76e2963b990b5b5eef66ff8a49f81b5fb95a87 [file] [log] [blame]
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///*
////----------------------------------------------------------------------------
//// File Name : impeg2_format_conv.s
////
//// Description : This file has the Idct Implementations for the
//// MPEG4 SP decoder on neon platform.
////
//// Reference Document :
////
//// Revision History :
//// Date Author Detail Description
//// ------------ ---------------- ----------------------------------
//// Jul 07, 2008 Naveen Kumar T Created
////
////-------------------------------------------------------------------------
//*/
///*
//// ----------------------------------------------------------------------------
//// Include Files
//// ----------------------------------------------------------------------------
//*/
.set log2_16 , 4
.set log2_2 , 1
.text
.include "impeg2_neon_macros.s"
///*
//// ----------------------------------------------------------------------------
//// Struct/Union Types and Define
//// ----------------------------------------------------------------------------
//*/
///*
//// ----------------------------------------------------------------------------
//// Static Global Data section variables
//// ----------------------------------------------------------------------------
//*/
////--------------------------- NONE --------------------------------------------
///*
//// ----------------------------------------------------------------------------
//// Static Prototype Functions
//// ----------------------------------------------------------------------------
//*/
//// -------------------------- NONE --------------------------------------------
///*
//// ----------------------------------------------------------------------------
//// Exported functions
//// ----------------------------------------------------------------------------
//*/
///*****************************************************************************
//* *
//* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8() *
//* *
//* Description : This function conversts the image from YUV420P color *
//* space to 420SP color space(UV interleaved). *
//* *
//* Arguments : x0 pu1_y *
//* x1 pu1_u *
//* x2 pu1_v *
//* x3 pu1_dest_y *
//* x4 pu1_dest_uv *
//* x5 u2_height *
//* x6 u2_width *
//* x7 u2_stridey *
//* sp, #80 u2_strideu *
//* sp, #88 u2_stridev *
//* sp, #96 u2_dest_stride_y *
//* sp, #104 u2_dest_stride_uv *
//* sp, #112 convert_uv_only *
//* *
//* Values Returned : None *
//* *
//* Register Usage : x8, x10, x16, x20, v0, v1 *
//* *
//* Stack Usage : 80 Bytes *
//* *
//* Interruptibility : Interruptible *
//* *
//* Known Limitations *
//* Assumptions: Image Width: Assumed to be multiple of 16 and *
//* greater than or equal to 16 *
//* Image Height: Assumed to be even. *
//* *
//* Revision History : *
//* DD MM YYYY Author(s) Changes (Describe the changes made) *
//* 07 06 2010 Varshita Draft *
//* 07 06 2010 Naveen Kr T Completed *
//* *
//*****************************************************************************/
.global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8:
//// push the registers on the stack
// pu1_y, - x0
// pu1_u, - x1
// pu1_v, - x2
// pu1_dest_y, - x3
// pu1_dest_uv, - x4
// u2_height, - x5
// u2_width, - x6
// u2_stridey, - x7
// u2_strideu, - sp, #80
// u2_stridev, - sp, #88
// u2_dest_stride_y, - sp, #96
// u2_dest_stride_uv, - sp, #104
// convert_uv_only - sp, #112
// STMFD sp!,{x4-x12,x14}
push_v_regs
stp x19, x20, [sp, #-16]!
ldr w14, [sp, #112] //// Load convert_uv_only
cmp w14, #1
beq yuv420sp_uv_chroma
///* Do the preprocessing before the main loops start */
//// Load the parameters from stack
ldr w8, [sp, #96] //// Load u2_dest_stride_y from stack
uxtw x8, w8
sub x7, x7, x6 //// Source increment
sub x8, x8, x6 //// Destination increment
yuv420sp_uv_row_loop_y:
mov x16, x6
yuv420sp_uv_col_loop_y:
prfm pldl1keep, [x0, #128]
ld1 {v0.8b, v1.8b}, [x0], #16
st1 {v0.8b, v1.8b}, [x3], #16
sub x16, x16, #16
cmp x16, #15
bgt yuv420sp_uv_col_loop_y
cmp x16, #0
beq yuv420sp_uv_row_loop__y
////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
////Ex if width is 162, above loop will process 160 pixels. And
////Both source and destination will point to 146th pixel and then 16 bytes will be read
//// and written using VLD1 and VST1
sub x20, x16, #16
neg x16, x20
sub x0, x0, x16
sub x3, x3, x16
ld1 {v0.8b, v1.8b}, [x0], #16
st1 {v0.8b, v1.8b}, [x3], #16
yuv420sp_uv_row_loop__y:
add x0, x0, x7
add x3, x3, x8
subs x5, x5, #1
bgt yuv420sp_uv_row_loop_y
yuv420sp_uv_chroma:
ldr w7, [sp, #88] //// Load u2_strideu from stack
sxtw x7, w7
ldr w8, [sp, #104] //// Load u2_dest_stride_uv from stack
sxtw x8, w8
add x6, x6, 1
bic x6, x6, #1
add x9, x9, 1
sub x7, x7, x6, lsr #1 //// Source increment
sub x8, x8, x6 //// Destination increment
lsr x6, x6, #1
lsr x5, x5, #1
yuv420sp_uv_row_loop_uv:
mov x16, x6
yuv420sp_uv_col_loop_uv:
prfm pldl1keep, [x1, #128]
prfm pldl1keep, [x2, #128]
ld1 {v0.8b}, [x1], #8
ld1 {v1.8b}, [x2], #8
st2 {v0.8b, v1.8b}, [x4], #16
sub x16, x16, #8
cmp x16, #7
bgt yuv420sp_uv_col_loop_uv
cmp x16, #0
beq yuv420sp_uv_row_loop__uv
////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
////Ex if width is 162, above loop will process 160 pixels. And
////Both source and destination will point to 146th pixel and then 16 bytes will be read
//// and written using VLD1 and VST1
sub x20, x16, #8
neg x16, x20
sub x1, x1, x16
sub x2, x2, x16
sub x4, x4, x16, lsl #1
ld1 {v0.8b}, [x1], #8
ld1 {v1.8b}, [x2], #8
st2 {v0.8b, v1.8b}, [x4], #16
yuv420sp_uv_row_loop__uv:
add x1, x1, x7
add x2, x2, x7
add x4, x4, x8
subs x5, x5, #1
bgt yuv420sp_uv_row_loop_uv
////POP THE REGISTERS
// LDMFD sp!,{x4-x12,PC}
ldp x19, x20, [sp], #16
pop_v_regs
ret
///*****************************************************************************
//* *
//* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8() *
//* *
//* Description : This function conversts the image from YUV420P color *
//* space to 420SP color space(VU interleaved). *
//* This function is similar to above function *
//* IMP4D_CXA8_YUV420toYUV420SP_VU with a difference in *
//* VLD1.8 for chroma - order of registers is different *
//* *
//* Arguments : x0 pu1_y *
//* x1 pu1_u *
//* x2 pu1_v *
//* x3 pu1_dest_y *
//* x4 pu1_dest_uv *
//* x5 u2_height *
//* x6 u2_width *
//* x7 u2_stridey *
//* sp, #80 u2_strideu *
//* sp, #88 u2_stridev *
//* sp, #96 u2_dest_stride_y *
//* sp, #104 u2_dest_stride_uv *
//* sp, #112 convert_uv_only *
//* *
//* Values Returned : None *
//* *
//* Register Usage : x8, x14, x16, x20, v0, v1 *
//* *
//* Stack Usage : 80 Bytes *
//* *
//* Interruptibility : Interruptible *
//* *
//* Known Limitations *
//* Assumptions: Image Width: Assumed to be multiple of 16 and *
//* greater than or equal to 16 *
//* Image Height: Assumed to be even. *
//* *
//* Revision History : *
//* DD MM YYYY Author(s) Changes (Describe the changes made) *
//* 07 06 2010 Varshita Draft *
//* 07 06 2010 Naveen Kr T Completed *
//* *
//*****************************************************************************/
.global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8:
//// push the registers on the stack
// pu1_y, - x0
// pu1_u, - x1
// pu1_v, - x2
// pu1_dest_y, - x3
// pu1_dest_uv, - x4
// u2_height, - x5
// u2_width, - x6
// u2_stridey, - x7
// u2_strideu, - sp, #80
// u2_stridev, - sp, #88
// u2_dest_stride_y, - sp, #96
// u2_dest_stride_uv, - sp, #104
// convert_uv_only - sp, #112
// STMFD sp!,{x4-x12,x14}
push_v_regs
stp x19, x20, [sp, #-16]!
ldr w14, [sp, #112] //// Load convert_uv_only
cmp w14, #1
beq yuv420sp_vu_chroma
///* Do the preprocessing before the main loops start */
//// Load the parameters from stack
ldr w8, [sp, #96] //// Load u2_dest_stride_y from stack
uxtw x8, w8
sub x7, x7, x6 //// Source increment
sub x8, x8, x6 //// Destination increment
yuv420sp_vu_row_loop_y:
mov x16, x6
yuv420sp_vu_col_loop_y:
prfm pldl1keep, [x0, #128]
ld1 {v0.8b, v1.8b}, [x0], #16
st1 {v0.8b, v1.8b}, [x3], #16
sub x16, x16, #16
cmp x16, #15
bgt yuv420sp_vu_col_loop_y
cmp x16, #0
beq yuv420sp_vu_row_loop__y
////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
////Ex if width is 162, above loop will process 160 pixels. And
////Both source and destination will point to 146th pixel and then 16 bytes will be read
//// and written using VLD1 and VST1
sub x20, x16, #16
neg x16, x20
sub x0, x0, x16
sub x3, x3, x16
ld1 {v0.8b, v1.8b}, [x0], #16
st1 {v0.8b, v1.8b}, [x3], #16
yuv420sp_vu_row_loop__y:
add x0, x0, x7
add x3, x3, x8
subs x5, x5, #1
bgt yuv420sp_vu_row_loop_y
yuv420sp_vu_chroma:
ldr w7, [sp, #80] //// Load u2_strideu from stack
sxtw x7, w7
ldr w8, [sp, #104] //// Load u2_dest_stride_uv from stack
sxtw x8, w8
add x6, x6, 1
bic x6, x6, #1
add x9, x9, 1
sub x7, x7, x6, lsr #1 //// Source increment
sub x8, x8, x6 //// Destination increment
lsr x6, x6, #1
lsr x5, x5, #1
yuv420sp_vu_row_loop_uv:
mov x16, x6
yuv420sp_vu_col_loop_uv:
prfm pldl1keep, [x1, #128]
prfm pldl1keep, [x2, #128]
ld1 {v1.8b}, [x1], #8
ld1 {v0.8b}, [x2], #8
st2 {v0.8b, v1.8b}, [x4], #16
sub x16, x16, #8
cmp x16, #7
bgt yuv420sp_vu_col_loop_uv
cmp x16, #0
beq yuv420sp_vu_row_loop__uv
////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
////Ex if width is 162, above loop will process 160 pixels. And
////Both source and destination will point to 146th pixel and then 16 bytes will be read
//// and written using VLD1 and VST1
sub x20, x16, #8
neg x16, x20
sub x1, x1, x16
sub x2, x2, x16
sub x4, x4, x16, lsl #1
ld1 {v1.8b}, [x1], #8
ld1 {v0.8b}, [x2], #8
st2 {v0.8b, v1.8b}, [x4], #16
yuv420sp_vu_row_loop__uv:
add x1, x1, x7
add x2, x2, x7
add x4, x4, x8
subs x5, x5, #1
bgt yuv420sp_vu_row_loop_uv
////POP THE REGISTERS
// LDMFD sp!,{x4-x12,PC}
ldp x19, x20, [sp], #16
pop_v_regs
ret