blob: bd9a81a8e1a85db31e77bc24df400ee94a948899 [file] [log] [blame]
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@/*
@//----------------------------------------------------------------------------
@// File Name : impeg2_format_conv.s
@//
@// Description : This file has the YUV420P to YUV420SP (UV and VU interleaved)
@// format conversion routines for the NEON platform.
@//
@// Reference Document :
@//
@// Revision History :
@// Date Author Detail Description
@// ------------ ---------------- ----------------------------------
@// Jul 07, 2008 Naveen Kumar T Created
@//
@//-------------------------------------------------------------------------
@*/
@/*
@// ----------------------------------------------------------------------------
@// Include Files
@// ----------------------------------------------------------------------------
@*/
@// Everything below is emitted into the code section.
.text
@// Align the following code to a 2^2 = 4 byte boundary.
.p2align 2
@// Assemble-time constants: log2(16) and log2(2).
@// NOTE(review): neither symbol is referenced by the routines visible in this
@// file; kept in case another part of the build relies on them.
.equ log2_16 , 4
.equ log2_2 , 1
@/*
@// ----------------------------------------------------------------------------
@// Struct/Union Types and Define
@// ----------------------------------------------------------------------------
@*/
@/*
@// ----------------------------------------------------------------------------
@// Static Global Data section variables
@// ----------------------------------------------------------------------------
@*/
@//--------------------------- NONE --------------------------------------------
@/*
@// ----------------------------------------------------------------------------
@// Static Prototype Functions
@// ----------------------------------------------------------------------------
@*/
@// -------------------------- NONE --------------------------------------------
@/*
@// ----------------------------------------------------------------------------
@// Exported functions
@// ----------------------------------------------------------------------------
@*/
@/*****************************************************************************
@*
@*  Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q()
@*
@*  Description   : Converts an image from planar YUV420P to semi-planar
@*                  YUV420SP with the chroma plane interleaved as U,V,U,V...
@*                  The luma plane is copied row by row unless
@*                  convert_uv_only == 1, in which case only the chroma
@*                  plane is produced.
@*
@*  Arguments     : Stack offsets are relative to SP *after* the 24-byte
@*                  register push done on entry.
@*                  R0          pu1_y
@*                  R1          pu1_u
@*                  R2          pu1_v
@*                  R3          pu1_dest_y
@*                  [SP, #24]   pu1_dest_uv
@*                  [SP, #28]   u2_height
@*                  [SP, #32]   u2_width
@*                  [SP, #36]   u2_stridey
@*                  [SP, #40]   u2_strideu
@*                  [SP, #44]   u2_stridev
@*                  [SP, #48]   u2_dest_stride_y
@*                  [SP, #52]   u2_dest_stride_uv
@*                  [SP, #56]   convert_uv_only
@*
@*  Values Returned : None
@*
@*  Register Usage  : R0 - R8, R12, Q0 (D0/D1)
@*                    R4 - R8 are saved/restored; R12 is used as scratch and
@*                    is not preserved.
@*
@*  Stack Usage     : 24 Bytes (R4 - R8, LR)
@*
@*  Interruptibility: Interruptible
@*
@*  Known Limitations
@*      Image Width : Must be at least 16. It need not be a multiple of 16:
@*                    a leftover of fewer than 16 luma (8 chroma) samples is
@*                    handled by stepping the pointers back and re-copying
@*                    one full, overlapping vector. For narrower images that
@*                    step-back would address memory before the row start.
@*      Image Height: Luma copies u2_height rows; chroma processes
@*                    (u2_height + 1) >> 1 rows. A height of 0 copies nothing.
@*
@*  Revision History :
@*      DD MM YYYY  Author(s)    Changes (Describe the changes made)
@*      07 06 2010  Varshita     Draft
@*      07 06 2010  Naveen Kr T  Completed
@*
@*****************************************************************************/
    .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q
impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q:
    @// Save the callee-saved registers used below (6 words = 24 bytes)
    stmfd       sp!, {r4-r8, lr}

    ldr         r4, [sp, #56]               @// r4 = convert_uv_only
    cmp         r4, #1
    beq         yuv420sp_uv_chroma          @// chroma only: skip the luma copy

    @// ---------------------------- Luma copy -------------------------------
    ldr         r4, [sp, #28]               @// r4 = u2_height (rows remaining)
    ldr         r5, [sp, #32]               @// r5 = u2_width
    ldr         r7, [sp, #36]               @// r7 = u2_stridey
    ldr         r8, [sp, #48]               @// r8 = u2_dest_stride_y
    cmp         r4, #0
    ble         yuv420sp_uv_chroma          @// no rows: do not touch the luma plane
    sub         r7, r7, r5                  @// r7 = source step from row end to next row
    sub         r8, r8, r5                  @// r8 = destination step, likewise

yuv420sp_uv_row_loop_y:
    mov         r6, r5                      @// r6 = samples left in this row

yuv420sp_uv_col_loop_y:
    pld         [r0, #128]
    vld1.8      {q0}, [r0]!                 @// copy 16 luma samples
    vst1.8      {q0}, [r3]!
    sub         r6, r6, #16
    cmp         r6, #15
    bgt         yuv420sp_uv_col_loop_y      @// at least 16 samples still left

    cmp         r6, #0
    beq         yuv420sp_uv_row_loop_end_y  @// width was a multiple of 16

    @// 1..15 samples are left. Step both pointers back by (16 - left) so that
    @// one more full 16-byte copy ends exactly on the last sample of the row.
    @// Ex: width 162 -> the loop handled 160, 2 are left; go back 14 and copy
    @// samples 146..161 (the overlap is simply rewritten with the same data).
    rsb         r6, r6, #16
    sub         r0, r0, r6
    sub         r3, r3, r6
    vld1.8      {q0}, [r0]!
    vst1.8      {q0}, [r3]!

yuv420sp_uv_row_loop_end_y:
    add         r0, r0, r7                  @// advance to the next source row
    add         r3, r3, r8                  @// advance to the next destination row
    subs        r4, r4, #1
    bgt         yuv420sp_uv_row_loop_y

    @// ------------------------- Chroma interleave --------------------------
yuv420sp_uv_chroma:
    ldr         r3, [sp, #24]               @// r3  = pu1_dest_uv
    ldr         r4, [sp, #28]               @// r4  = u2_height
    ldr         r5, [sp, #32]               @// r5  = u2_width
    ldr         r7, [sp, #40]               @// r7  = u2_strideu
    ldr         r12, [sp, #44]              @// r12 = u2_stridev
    ldr         r8, [sp, #52]               @// r8  = u2_dest_stride_uv

    add         r5, r5, #1
    bic         r5, r5, #1                  @// r5 = width rounded up to even
    sub         r8, r8, r5                  @// r8 = destination step (2 bytes per chroma sample)
    mov         r5, r5, lsr #1              @// r5 = chroma samples per row
    sub         r7, r7, r5                  @// r7  = U source step from row end to next row
    sub         r12, r12, r5                @// r12 = V source step (V has its own stride)
    add         r4, r4, #1
    movs        r4, r4, lsr #1              @// r4 = chroma rows = (height + 1) >> 1
    beq         yuv420sp_uv_exit            @// nothing to convert

yuv420sp_uv_row_loop_uv:
    mov         r6, r5                      @// r6 = chroma samples left in this row

yuv420sp_uv_col_loop_uv:
    pld         [r1, #128]
    pld         [r2, #128]
    vld1.8      {d0}, [r1]!                 @// d0 = 8 U samples
    vld1.8      {d1}, [r2]!                 @// d1 = 8 V samples
    vst2.8      {d0, d1}, [r3]!             @// store interleaved: U0 V0 U1 V1 ...
    sub         r6, r6, #8
    cmp         r6, #7
    bgt         yuv420sp_uv_col_loop_uv     @// at least 8 samples still left

    cmp         r6, #0
    beq         yuv420sp_uv_row_loop_end_uv @// chroma width was a multiple of 8

    @// 1..7 chroma samples are left. Step the sources back by (8 - left) and
    @// the destination back by twice that (two bytes per sample), then redo
    @// one full 8-sample interleave that ends exactly on the row's last sample.
    rsb         r6, r6, #8
    sub         r1, r1, r6
    sub         r2, r2, r6
    sub         r3, r3, r6, lsl #1
    vld1.8      {d0}, [r1]!
    vld1.8      {d1}, [r2]!
    vst2.8      {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_uv:
    add         r1, r1, r7                  @// next U row
    add         r2, r2, r12                 @// next V row (uses u2_stridev)
    add         r3, r3, r8                  @// next destination row
    subs        r4, r4, #1
    bgt         yuv420sp_uv_row_loop_uv

yuv420sp_uv_exit:
    @// Restore the saved registers and return
    ldmfd       sp!, {r4-r8, pc}
@/*****************************************************************************
@*
@*  Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q()
@*
@*  Description   : Converts an image from planar YUV420P to semi-planar
@*                  YUV420SP with the chroma plane interleaved as V,U,V,U...
@*                  Identical to impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q()
@*                  except that the chroma loads target swapped D registers,
@*                  so V is stored before U.
@*                  The luma plane is copied row by row unless
@*                  convert_uv_only == 1, in which case only the chroma
@*                  plane is produced.
@*
@*  Arguments     : Stack offsets are relative to SP *after* the 24-byte
@*                  register push done on entry.
@*                  R0          pu1_y
@*                  R1          pu1_u
@*                  R2          pu1_v
@*                  R3          pu1_dest_y
@*                  [SP, #24]   pu1_dest_uv
@*                  [SP, #28]   u2_height
@*                  [SP, #32]   u2_width
@*                  [SP, #36]   u2_stridey
@*                  [SP, #40]   u2_strideu
@*                  [SP, #44]   u2_stridev
@*                  [SP, #48]   u2_dest_stride_y
@*                  [SP, #52]   u2_dest_stride_uv
@*                  [SP, #56]   convert_uv_only
@*
@*  Values Returned : None
@*
@*  Register Usage  : R0 - R8, R12, Q0 (D0/D1)
@*                    R4 - R8 are saved/restored; R12 is used as scratch and
@*                    is not preserved.
@*
@*  Stack Usage     : 24 Bytes (R4 - R8, LR)
@*
@*  Interruptibility: Interruptible
@*
@*  Known Limitations
@*      Image Width : Must be at least 16. It need not be a multiple of 16:
@*                    a leftover of fewer than 16 luma (8 chroma) samples is
@*                    handled by stepping the pointers back and re-copying
@*                    one full, overlapping vector. For narrower images that
@*                    step-back would address memory before the row start.
@*      Image Height: Luma copies u2_height rows; chroma processes
@*                    (u2_height + 1) >> 1 rows. A height of 0 copies nothing.
@*
@*  Revision History :
@*      DD MM YYYY  Author(s)    Changes (Describe the changes made)
@*      07 06 2010  Varshita     Draft
@*      07 06 2010  Naveen Kr T  Completed
@*
@*****************************************************************************/
    .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q
impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q:
    @// Save the callee-saved registers used below (6 words = 24 bytes)
    stmfd       sp!, {r4-r8, lr}

    ldr         r4, [sp, #56]               @// r4 = convert_uv_only
    cmp         r4, #1
    beq         yuv420sp_vu_chroma          @// chroma only: skip the luma copy

    @// ---------------------------- Luma copy -------------------------------
    ldr         r4, [sp, #28]               @// r4 = u2_height (rows remaining)
    ldr         r5, [sp, #32]               @// r5 = u2_width
    ldr         r7, [sp, #36]               @// r7 = u2_stridey
    ldr         r8, [sp, #48]               @// r8 = u2_dest_stride_y
    cmp         r4, #0
    ble         yuv420sp_vu_chroma          @// no rows: do not touch the luma plane
    sub         r7, r7, r5                  @// r7 = source step from row end to next row
    sub         r8, r8, r5                  @// r8 = destination step, likewise

yuv420sp_vu_row_loop_y:
    mov         r6, r5                      @// r6 = samples left in this row

yuv420sp_vu_col_loop_y:
    pld         [r0, #128]
    vld1.8      {q0}, [r0]!                 @// copy 16 luma samples
    vst1.8      {q0}, [r3]!
    sub         r6, r6, #16
    cmp         r6, #15
    bgt         yuv420sp_vu_col_loop_y      @// at least 16 samples still left

    cmp         r6, #0
    beq         yuv420sp_vu_row_loop_end_y  @// width was a multiple of 16

    @// 1..15 samples are left. Step both pointers back by (16 - left) so that
    @// one more full 16-byte copy ends exactly on the last sample of the row.
    @// Ex: width 162 -> the loop handled 160, 2 are left; go back 14 and copy
    @// samples 146..161 (the overlap is simply rewritten with the same data).
    rsb         r6, r6, #16
    sub         r0, r0, r6
    sub         r3, r3, r6
    vld1.8      {q0}, [r0]!
    vst1.8      {q0}, [r3]!

yuv420sp_vu_row_loop_end_y:
    add         r0, r0, r7                  @// advance to the next source row
    add         r3, r3, r8                  @// advance to the next destination row
    subs        r4, r4, #1
    bgt         yuv420sp_vu_row_loop_y

    @// ------------------------- Chroma interleave --------------------------
yuv420sp_vu_chroma:
    ldr         r3, [sp, #24]               @// r3  = pu1_dest_uv
    ldr         r4, [sp, #28]               @// r4  = u2_height
    ldr         r5, [sp, #32]               @// r5  = u2_width
    ldr         r7, [sp, #40]               @// r7  = u2_strideu
    ldr         r12, [sp, #44]              @// r12 = u2_stridev
    ldr         r8, [sp, #52]               @// r8  = u2_dest_stride_uv

    add         r5, r5, #1
    bic         r5, r5, #1                  @// r5 = width rounded up to even
    sub         r8, r8, r5                  @// r8 = destination step (2 bytes per chroma sample)
    mov         r5, r5, lsr #1              @// r5 = chroma samples per row
    sub         r7, r7, r5                  @// r7  = U source step from row end to next row
    sub         r12, r12, r5                @// r12 = V source step (V has its own stride)
    add         r4, r4, #1
    movs        r4, r4, lsr #1              @// r4 = chroma rows = (height + 1) >> 1
    beq         yuv420sp_vu_exit            @// nothing to convert

yuv420sp_vu_row_loop_uv:
    mov         r6, r5                      @// r6 = chroma samples left in this row

yuv420sp_vu_col_loop_uv:
    pld         [r1, #128]
    pld         [r2, #128]
    vld1.8      {d1}, [r1]!                 @// d1 = 8 U samples
    vld1.8      {d0}, [r2]!                 @// d0 = 8 V samples
    vst2.8      {d0, d1}, [r3]!             @// store interleaved: V0 U0 V1 U1 ...
    sub         r6, r6, #8
    cmp         r6, #7
    bgt         yuv420sp_vu_col_loop_uv     @// at least 8 samples still left

    cmp         r6, #0
    beq         yuv420sp_vu_row_loop_end_uv @// chroma width was a multiple of 8

    @// 1..7 chroma samples are left. Step the sources back by (8 - left) and
    @// the destination back by twice that (two bytes per sample), then redo
    @// one full 8-sample interleave that ends exactly on the row's last sample.
    rsb         r6, r6, #8
    sub         r1, r1, r6
    sub         r2, r2, r6
    sub         r3, r3, r6, lsl #1
    vld1.8      {d1}, [r1]!
    vld1.8      {d0}, [r2]!
    vst2.8      {d0, d1}, [r3]!

yuv420sp_vu_row_loop_end_uv:
    add         r1, r1, r7                  @// next U row
    add         r2, r2, r12                 @// next V row (uses u2_stridev)
    add         r3, r3, r8                  @// next destination row
    subs        r4, r4, #1
    bgt         yuv420sp_vu_row_loop_uv

yuv420sp_vu_exit:
    @// Restore the saved registers and return
    ldmfd       sp!, {r4-r8, pc}