@ blob: f8f5e4217cff2ded5570f52f027365012ec01c13 [file] [log] [blame]
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
.text
.p2align 2
@/*****************************************************************************
@*
@*  Function Name : ih264e_fmt_conv_420p_to_420sp_a9q
@*
@*  Description   : Converts an image from planar YUV420 (separate Y, U and V
@*                  planes) to semi-planar YUV420 (Y plane followed by one
@*                  plane of interleaved U,V bytes).
@*                  The Y plane is copied row by row; the U and V planes are
@*                  interleaved byte-wise (U first) into pu1_dest_uv.
@*
@*  Arguments     : r0          pu1_y               source Y plane
@*                  r1          pu1_u               source U plane
@*                  r2          pu1_v               source V plane
@*                  r3          pu1_dest_y          destination Y plane
@*                  Stack arguments; offsets are relative to sp AFTER the
@*                  40-byte register push done at function entry:
@*                  [sp, #40]   pu1_dest_uv         destination UV plane
@*                  [sp, #44]   u2_height           luma height in rows
@*                  [sp, #48]   u2_width            luma width in pixels
@*                  [sp, #52]   u2_stridey          source Y stride (bytes)
@*                  [sp, #56]   u2_strideu          source U stride (bytes)
@*                  [sp, #60]   u2_stridev          source V stride (bytes)
@*                  [sp, #64]   u2_dest_stride_y    destination Y stride
@*                  [sp, #68]   u2_dest_stride_uv   destination UV stride
@*                  [sp, #72]   convert_uv_only     1 => skip the Y copy
@*
@*  Values Returned : None
@*
@*  Register Usage  : r0-r9 (r4-r12 and lr are saved/restored on the stack),
@*                    NEON d0, d1
@*
@*  Stack Usage     : 40 bytes
@*
@*  Known Limitations / Assumptions :
@*      - Luma is copied 16 bytes at a time and chroma 8+8 bytes at a time.
@*        A row whose length is not a multiple of the vector size is
@*        finished by stepping the pointers BACK and re-copying one full,
@*        overlapping vector. This reads/writes before the start of the row
@*        unless u2_width >= 16 (i.e. chroma width >= 8).
@*      - Both row loops test their counter at the bottom, so at least one
@*        luma row and one chroma row are always processed; u2_height is
@*        expected to be even and non-zero.
@*      - Every stack argument is read as a full 32-bit word.
@*
@*  Revision History :
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
@*         07 06 2010   Varshita        Draft
@*         07 06 2010   Naveen Kr T     Completed
@*
@*****************************************************************************/
    .global ih264e_fmt_conv_420p_to_420sp_a9q
ih264e_fmt_conv_420p_to_420sp_a9q:
    @// Save working registers: 10 registers = 40 bytes, so the first stack
    @// argument now lives at [sp, #40]
    stmfd           sp!, {r4-r12, lr}

    ldr             r4, [sp, #72]           @// r4 = convert_uv_only
    cmp             r4, #1
    beq             yuv420sp_uv_chroma      @// == 1: skip the luma copy

    @/* ------------------------- Luma plane copy ------------------------- */
    ldr             r4, [sp, #44]           @// r4 = u2_height (rows left)
    ldr             r5, [sp, #48]           @// r5 = u2_width
    ldr             r7, [sp, #52]           @// r7 = u2_stridey
    ldr             r8, [sp, #64]           @// r8 = u2_dest_stride_y
    sub             r7, r7, r5              @// r7 = source end-of-row skip
    sub             r8, r8, r5              @// r8 = destination end-of-row skip

yuv420sp_uv_row_loop_y:
    mov             r6, r5                  @// r6 = pixels left in this row

yuv420sp_uv_col_loop_y:
    pld             [r0, #128]
    vld1.8          {d0, d1}, [r0]!         @// copy 16 luma bytes
    vst1.8          {d0, d1}, [r3]!
    sub             r6, r6, #16
    cmp             r6, #15
    bgt             yuv420sp_uv_col_loop_y  @// another full vector fits

    cmp             r6, #0
    beq             yuv420sp_uv_row_loop_end_y

    @// 1..15 pixels remain: step both pointers back by (16 - remaining) so
    @// that one more 16-byte copy ends exactly on the last pixel of the row.
    @// Ex: width 162 -> the loop above handles 160 pixels, the pointers are
    @// moved back to pixel 146 and pixels 146..161 are copied.
    rsb             r6, r6, #16
    sub             r0, r0, r6
    sub             r3, r3, r6
    vld1.8          {d0, d1}, [r0]!
    vst1.8          {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_y:
    add             r0, r0, r7              @// advance to next source row
    add             r3, r3, r8              @// advance to next destination row
    subs            r4, r4, #1
    bgt             yuv420sp_uv_row_loop_y

    @/* --------------------- Chroma plane interleave --------------------- */
yuv420sp_uv_chroma:
    ldr             r3, [sp, #40]           @// r3 = pu1_dest_uv
    ldr             r4, [sp, #44]           @// r4 = u2_height
    ldr             r5, [sp, #48]           @// r5 = u2_width
    ldr             r7, [sp, #56]           @// r7 = u2_strideu
    ldr             r9, [sp, #60]           @// r9 = u2_stridev
    ldr             r8, [sp, #68]           @// r8 = u2_dest_stride_uv

    sub             r7, r7, r5, lsr #1      @// r7 = U end-of-row skip
    sub             r9, r9, r5, lsr #1      @// r9 = V end-of-row skip (own stride)
    sub             r8, r8, r5              @// r8 = UV end-of-row skip (width bytes/row)
    mov             r5, r5, lsr #1          @// r5 = chroma width  = width  / 2
    mov             r4, r4, lsr #1          @// r4 = chroma height = height / 2

yuv420sp_uv_row_loop_uv:
    mov             r6, r5                  @// r6 = chroma samples left in this row

yuv420sp_uv_col_loop_uv:
    pld             [r1, #128]
    pld             [r2, #128]
    vld1.8          d0, [r1]!               @// 8 U samples
    vld1.8          d1, [r2]!               @// 8 V samples
    vst2.8          {d0, d1}, [r3]!         @// store U0 V0 U1 V1 ... (16 bytes)
    sub             r6, r6, #8
    cmp             r6, #7
    bgt             yuv420sp_uv_col_loop_uv @// another full vector fits

    cmp             r6, #0
    beq             yuv420sp_uv_row_loop_end_uv

    @// 1..7 chroma samples remain: step U and V back by (8 - remaining)
    @// bytes and the interleaved destination back by twice that, then do
    @// one more overlapping 8+8 -> 16 byte interleave ending on the last
    @// sample of the row.
    @// Ex: width 162 -> chroma width 81; the loop above handles 80 samples,
    @// the sources are moved back to sample 73 and samples 73..80 are redone.
    rsb             r6, r6, #8
    sub             r1, r1, r6
    sub             r2, r2, r6
    sub             r3, r3, r6, lsl #1
    vld1.8          d0, [r1]!
    vld1.8          d1, [r2]!
    vst2.8          {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_uv:
    add             r1, r1, r7              @// next U row
    add             r2, r2, r9              @// next V row
    add             r3, r3, r8              @// next UV row
    subs            r4, r4, #1
    bgt             yuv420sp_uv_row_loop_uv

    @// Restore registers and return (saved lr is popped straight into pc)
    ldmfd           sp!, {r4-r12, pc}
@/**
@ *******************************************************************************
@ *
@ * @brief ih264e_fmt_conv_422i_to_420sp_a9q
@ *  Converts an interleaved 4:2:2 image (2 bytes per pixel) into a Y plane
@ *  plus one plane of interleaved chroma bytes, processing two source rows
@ *  per pass.
@ *
@ * @par Description
@ *  Every 4-byte group of the source is de-interleaved:
@ *    - bytes 1 and 3 are the two luma samples and are written to the Y plane
@ *      (one output row per source row);
@ *    - bytes 0 and 2 are the two chroma samples; each is averaged with
@ *      rounding (VRHADD) against the same byte of the row below, and the two
@ *      averages are written interleaved to the plane addressed by pu1_u.
@ *  NOTE(review): this byte layout corresponds to U0 Y0 V0 Y1 ordering if
@ *  byte 0 is U - confirm against the caller.
@ *
@ * Inputs (stack offsets are relative to sp AFTER the 40-byte register push):
@ *    r0        - pu1_y             - UWORD8 pointer to destination Y plane
@ *    r1        - pu1_u             - UWORD8 pointer to destination chroma
@ *                                    plane (receives interleaved output)
@ *    r2        - pu1_v             - not referenced by this routine
@ *    r3        - pu2_yuv422i       - UWORD16 pointer to source 4:2:2 image
@ *    [sp, #40] - u4_width          - width of the Y plane in pixels
@ *    [sp, #44] - u4_height         - height of the Y plane in rows
@ *    [sp, #48] - u4_stride_y       - stride of the Y plane in bytes
@ *    [sp, #52] - u4_stride_u       - stride of the chroma plane in bytes
@ *    [sp, #56] - u4_stride_v       - not read by this routine
@ *    [sp, #60] - u4_stride_yuv422i - source stride in pixels (2 bytes each)
@ *
@ * @returns None
@ *
@ * @remarks
@ *  - 16 pixels are handled per vector pass. A row whose width is not a
@ *    multiple of 16 is completed by stepping all pointers back and redoing
@ *    one overlapping 16-pixel pass, so the width must be at least 16.
@ *  - The row-pair loop tests its counter at the bottom, so one pair of rows
@ *    is always processed; the height is expected to be even and non-zero.
@ *
@ * Revision History :
@ *     DD MM YYYY   Author(s)           Changes (Describe the changes made)
@ *     07 06 2010   Harinarayanan K K   Adapted to 422p
@ *
@ *******************************************************************************
@ */
    .global ih264e_fmt_conv_422i_to_420sp_a9q
ih264e_fmt_conv_422i_to_420sp_a9q:
    stmfd           sp!, {r4-r12, lr}       @ save working registers (40 bytes)

    @ ---- fetch stack arguments, in stack order ----
    ldr             r7, [sp, #40]           @ r7  = u4_width
    ldr             r11, [sp, #44]          @ r11 = u4_height
    ldr             r4, [sp, #48]           @ r4  = u4_stride_y
    ldr             r9, [sp, #52]           @ r9  = u4_stride_u
    ldr             r5, [sp, #60]           @ r5  = u4_stride_yuv422i (pixels)

    @ ---- pointers to the second row of each row pair ----
    add             r6, r0, r4              @ r6 = pu1_y + stride_y
    add             r8, r3, r5, lsl #1      @ r8 = source + stride_yuv422i * 2 bytes

    @ ---- per-row-pair pointer increments (applied after a full row) ----
    rsb             r4, r7, r4, lsl #1      @ r4 = 2 * stride_y - width
    rsb             r5, r7, r5, lsl #1      @ r5 = 2 * stride_yuv422i - width
    mov             r5, r5, lsl #1          @ r5 = 4 * stride_yuv422i - 2 * width (bytes)
    sub             r9, r9, r7              @ r9 = stride_u - width
    mov             r11, r11, asr #1        @ r11 = number of row pairs = height / 2

    @ Register map inside the loops
    @   r0 / r6   - Y destination, first / second row of the pair
    @   r1        - interleaved chroma destination
    @   r3 / r8   - 4:2:2 source, first / second row of the pair
    @   r4        - Y increment to the next row pair
    @   r5        - source increment to the next row pair
    @   r9        - chroma increment to the next row
    @   r7        - width in pixels
    @   r11       - row pairs remaining
    @   r12       - pixels remaining in the current row pair

fmt_conv_422i_row_pair_loop:
    mov             r12, r7                 @ restart the column counter

fmt_conv_422i_col_loop:
    vld4.8          {d0, d1, d2, d3}, [r3]! @ 16 pixels of row 1: d0/d2 chroma, d1/d3 luma
    vld4.8          {d4, d5, d6, d7}, [r8]! @ 16 pixels of row 2: d4/d6 chroma, d5/d7 luma
    vst2.8          {d1, d3}, [r0]!         @ 16 luma bytes of row 1
    vst2.8          {d5, d7}, [r6]!         @ 16 luma bytes of row 2
    vrhadd.u8       d0, d0, d4              @ rounded vertical average, chroma byte 0
    vrhadd.u8       d2, d2, d6              @ rounded vertical average, chroma byte 2
    vst2.8          {d0, d2}, [r1]!         @ 16 interleaved chroma bytes
    sub             r12, r12, #16
    cmp             r12, #15
    bgt             fmt_conv_422i_col_loop  @ another full 16-pixel pass fits

    cmp             r12, #0
    beq             fmt_conv_422i_row_pair_end

    @ 1..15 pixels remain: rewind every pointer by (16 - remaining) pixels
    @ (twice that many bytes for the 2-byte-per-pixel source) and redo one
    @ overlapping 16-pixel pass that ends on the last pixel of the row.
    rsb             r12, r12, #16
    sub             r3, r3, r12, lsl #1
    sub             r8, r8, r12, lsl #1
    sub             r0, r0, r12
    sub             r6, r6, r12
    sub             r1, r1, r12
    vld4.8          {d0, d1, d2, d3}, [r3]!
    vld4.8          {d4, d5, d6, d7}, [r8]!
    vst2.8          {d1, d3}, [r0]!
    vst2.8          {d5, d7}, [r6]!
    vrhadd.u8       d0, d0, d4
    vrhadd.u8       d2, d2, d6
    vst2.8          {d0, d2}, [r1]!

fmt_conv_422i_row_pair_end:
    @ move every pointer to the start of the next pair of rows
    add             r0, r0, r4
    add             r6, r6, r4
    add             r1, r1, r9
    add             r3, r3, r5
    add             r8, r8, r5
    subs            r11, r11, #1
    bgt             fmt_conv_422i_row_pair_loop

    ldmfd           sp!, {r4-r12, pc}       @ restore registers and return