 @ encoder/arm/ih264e_fmt_conv.s - platform/external/libavc - Git at Google

 @/******************************************************************************
 @ *
 @ * Copyright (C) 2015 The Android Open Source Project
 @ *
 @ * Licensed under the Apache License, Version 2.0 (the "License");
 @ * you may not use this file except in compliance with the License.
 @ * You may obtain a copy of the License at:
 @ *
 @ * http://www.apache.org/licenses/LICENSE-2.0
 @ *
 @ * Unless required by applicable law or agreed to in writing, software
 @ * distributed under the License is distributed on an "AS IS" BASIS,
 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 @ * See the License for the specific language governing permissions and
 @ * limitations under the License.
 @ *
 @ *****************************************************************************
 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 @*/

 .text
 .p2align 2

 @/*****************************************************************************
 @*
 @*  Function Name    : ih264e_fmt_conv_420p_to_420sp_a9q
 @*
 @*  Description      : Converts a planar YUV 4:2:0 image (separate Y, U and V
 @*                     planes) into semi-planar 4:2:0: a Y plane plus one
 @*                     plane of byte-interleaved U,V samples.
 @*                     The luma plane is copied row by row unless
 @*                     convert_uv_only == 1; chroma is always interleaved.
 @*
 @*  Arguments        : r0           pu1_y              (source Y)
 @*                     r1           pu1_u              (source U)
 @*                     r2           pu1_v              (source V)
 @*                     r3           pu1_dest_y         (destination Y)
 @*                     Stack arguments, offsets taken after the 40-byte push:
 @*                     [sp, #40]    pu1_dest_uv
 @*                     [sp, #44]    u2_height
 @*                     [sp, #48]    u2_width
 @*                     [sp, #52]    u2_stridey
 @*                     [sp, #56]    u2_strideu
 @*                     [sp, #60]    u2_stridev
 @*                     [sp, #64]    u2_dest_stride_y
 @*                     [sp, #68]    u2_dest_stride_uv
 @*                     [sp, #72]    convert_uv_only
 @*
 @*  Values Returned  : None
 @*
 @*  Register Usage   : r0 - r9, d0 - d1 (r4-r12 and lr are saved/restored)
 @*
 @*  Stack Usage      : 40 bytes (r4-r12, lr)
 @*
 @*  Known Limitations
 @*       Assumptions : Width must be at least 16. Each row is copied in
 @*                     16-byte (luma) / 8-byte (chroma) vectors; a row whose
 @*                     length is not a multiple of the vector size is
 @*                     finished with one overlapping vector that ends exactly
 @*                     on the last sample, so a shorter row would make that
 @*                     vector start before the row.
 @*                     Height is assumed to be even and non-zero; both row
 @*                     loops execute at least once.
 @*
 @*  Revision History :
 @*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
 @*         07 06 2010   Varshita        Draft
 @*         07 06 2010   Naveen Kr T     Completed
 @*
 @*****************************************************************************/
     .global ih264e_fmt_conv_420p_to_420sp_a9q

 ih264e_fmt_conv_420p_to_420sp_a9q:

     @// Save r4-r12 and lr: 10 words = 40 bytes, so the first stack argument
     @// is now at [sp, #40].
     stmfd         sp!, {r4-r12, lr}

     ldr           r4, [sp, #72]         @// r4 = convert_uv_only

     cmp           r4, #1
     beq           yuv420sp_uv_chroma    @// Skip the luma copy when only chroma is requested

     @// ---------------- Luma plane copy ----------------
     ldr           r4, [sp, #44]         @// r4 = u2_height (row counter)
     ldr           r5, [sp, #48]         @// r5 = u2_width
     ldr           r7, [sp, #52]         @// r7 = u2_stridey
     ldr           r8, [sp, #64]         @// r8 = u2_dest_stride_y
     sub           r7, r7, r5            @// r7 = source gap from row end to next row start
     sub           r8, r8, r5            @// r8 = destination gap from row end to next row start

 yuv420sp_uv_row_loop_y:
     mov           r6, r5                @// r6 = bytes left in this row

 yuv420sp_uv_col_loop_y:
     pld           [r0, #128]
     vld1.8        {d0, d1}, [r0]!       @// Copy 16 luma bytes
     vst1.8        {d0, d1}, [r3]!
     sub           r6, r6, #16
     cmp           r6, #15
     bgt           yuv420sp_uv_col_loop_y @// At least one more full vector remains

     cmp           r6, #0
     beq           yuv420sp_uv_row_loop_end_y
     @// 1..15 bytes remain: step both pointers back by (16 - remaining) so
     @// that one more 16-byte copy ends exactly on the last byte of the row.
     @// Ex: width 162 -> loop handles 160 bytes, pointers move back to
     @// byte 146 and the final copy covers bytes 146..161.
     rsb           r6, r6, #16
     sub           r0, r0, r6
     sub           r3, r3, r6

     vld1.8        {d0, d1}, [r0]!
     vst1.8        {d0, d1}, [r3]!

 yuv420sp_uv_row_loop_end_y:
     add           r0, r0, r7            @// Advance source to next row
     add           r3, r3, r8            @// Advance destination to next row
     subs          r4, r4, #1
     bgt           yuv420sp_uv_row_loop_y

     @// ---------------- Chroma interleave ----------------
 yuv420sp_uv_chroma:

     ldr           r3, [sp, #40]         @// r3 = pu1_dest_uv
     ldr           r4, [sp, #44]         @// r4 = u2_height
     ldr           r5, [sp, #48]         @// r5 = u2_width
     ldr           r7, [sp, #56]         @// r7 = u2_strideu
     ldr           r9, [sp, #60]         @// r9 = u2_stridev
     ldr           r8, [sp, #68]         @// r8 = u2_dest_stride_uv

     sub           r7, r7, r5, lsr #1    @// r7 = U source gap  (strideu - width/2)
     sub           r9, r9, r5, lsr #1    @// r9 = V source gap  (stridev - width/2)
     sub           r8, r8, r5            @// r8 = destination gap (UV row is width bytes)

     mov           r5, r5, lsr #1        @// r5 = chroma samples per row (width / 2)
     mov           r4, r4, lsr #1        @// r4 = chroma rows (height / 2)

 yuv420sp_uv_row_loop_uv:
     mov           r6, r5                @// r6 = U (and V) samples left in this row

 yuv420sp_uv_col_loop_uv:
     pld           [r1, #128]
     pld           [r2, #128]
     vld1.8        d0, [r1]!             @// 8 U samples
     vld1.8        d1, [r2]!             @// 8 V samples
     vst2.8        {d0, d1}, [r3]!       @// Store 16 bytes interleaved as U0 V0 U1 V1 ...
     sub           r6, r6, #8
     cmp           r6, #7
     bgt           yuv420sp_uv_col_loop_uv

     cmp           r6, #0
     beq           yuv420sp_uv_row_loop_end_uv
     @// 1..7 samples remain: step the U and V pointers back by
     @// (8 - remaining) samples and the interleaved destination back by
     @// twice that, so one more 8-sample pass ends exactly on the row end.
     rsb           r6, r6, #8
     sub           r1, r1, r6
     sub           r2, r2, r6
     sub           r3, r3, r6, lsl #1

     vld1.8        d0, [r1]!
     vld1.8        d1, [r2]!
     vst2.8        {d0, d1}, [r3]!

 yuv420sp_uv_row_loop_end_uv:
     add           r1, r1, r7            @// Next U row
     add           r2, r2, r9            @// Next V row (uses the V stride, not the U stride)
     add           r3, r3, r8            @// Next interleaved UV row
     subs          r4, r4, #1
     bgt           yuv420sp_uv_row_loop_uv
     @// Restore saved registers and return (saved lr is popped into pc)
     ldmfd         sp!, {r4-r12, pc}


 @ /**
 @ *******************************************************************************
 @ *
 @ * @brief ih264e_fmt_conv_422i_to_420sp_a9q
 @ *     Converts an interleaved 4:2:2 image (2 bytes per pixel) into a Y plane
 @ *     plus one interleaved chroma plane, two source rows per iteration.
 @ *
 @ * Inputs (stack offsets are taken after the 40-byte register push):
 @ *     r0         - pu1_y             - destination Y plane
 @ *     r1         - pu1_u             - destination interleaved chroma plane
 @ *     r2         - pu1_v             - not read by this routine
 @ *     r3         - pu2_yuv422i       - source interleaved 4:2:2 image
 @ *     [sp, #40]  - u4_width          - width of the Y plane
 @ *     [sp, #44]  - u4_height         - height of the Y plane
 @ *     [sp, #48]  - u4_stride_y       - stride of the Y plane
 @ *     [sp, #52]  - u4_stride_u       - stride of the chroma plane written via r1
 @ *     [sp, #56]  - u4_stride_v       - not read by this routine
 @ *     [sp, #60]  - u4_stride_yuv422i - source stride in pixels (2 bytes each)
 @ *
 @ * @par   Description
 @ *   In every 4-byte source group, bytes 1 and 3 are stored to the Y plane and
 @ *   bytes 0 and 2 are treated as the two chroma samples (U and V according to
 @ *   the original comments). The chroma of two vertically adjacent rows is
 @ *   combined with a rounding halving add (VRHADD) and written as one
 @ *   interleaved chroma row.
 @ *
 @ * @returns None
 @ *
 @ * @remarks
 @ *   16 pixels are handled per vector pass; a row whose width is not a
 @ *   multiple of 16 is completed by one overlapping pass that ends on the
 @ *   last pixel, so the width must be at least 16.
 @ *   Rows are consumed in pairs (height >> 1 iterations, at least one).
 @ *
 @ * Revision History :
 @ *         DD MM YYYY   Author(s)              Changes (Describe the changes made)
 @ *         07 06 2010   Harinarayanan K K       Adapeted to 422p
 @ *
 @ *******************************************************************************
 @ */

     .global ih264e_fmt_conv_422i_to_420sp_a9q
 ih264e_fmt_conv_422i_to_420sp_a9q:
     push          {r4-r12, lr}          @// 10 words saved; stack args start at [sp, #40]

     @// ---- Fetch the stack arguments ----
     ldr           r7, [sp, #40]         @// r7  = u4_width
     ldr           r11, [sp, #44]        @// r11 = u4_height
     ldr           r4, [sp, #48]         @// r4  = u4_stride_y
     ldr           r9, [sp, #52]         @// r9  = u4_stride_u
     ldr           r5, [sp, #60]         @// r5  = u4_stride_yuv422i

     @// ---- Second-row pointers ----
     add           r6, r0, r4            @// r6 = pu1_y + u4_stride_y
     add           r8, r3, r5, lsl #1    @// r8 = pu2_yuv422i + 2 * u4_stride_yuv422i (bytes)

     @// ---- Per-row-pair pointer adjustments ----
     mov           r11, r11, asr #1      @// r11 = u4_height / 2 (row pairs)
     sub           r9, r9, r7            @// r9  = u4_stride_u - u4_width (chroma row is width bytes)
     rsb           r4, r7, r4, lsl #1    @// r4  = 2 * u4_stride_y - u4_width
     sub           r14, r5, r7           @// r14 = u4_stride_yuv422i - u4_width
     mov           r14, r14, lsl #1      @// r14 = the same gap in bytes
     add           r5, r14, r5, lsl #1   @// r5  = byte gap + one extra source row in bytes

 @// Register roles inside the loops
 @//   r0  - Y destination, first row of the pair
 @//   r6  - Y destination, second row of the pair
 @//   r1  - interleaved chroma destination
 @//   r3  - source, first row of the pair
 @//   r8  - source, second row of the pair
 @//   r4  - Y pointer adjustment per row pair
 @//   r9  - chroma pointer adjustment per row pair
 @//   r5  - source pointer adjustment per row pair
 @//   r7  - u4_width
 @//   r11 - row pairs remaining
 @//   r12 - pixels remaining in the current row
 yuv422i_to_420sp_height_loop:

     mov           r12, r7               @// r12 = pixels left in this row

 yuv422i_to_420sp_width_loop:
     vld4.8        {d0-d3}, [r3]!        @// 16 pixels of row 1, split by byte position
     vld4.8        {d4-d7}, [r8]!        @// 16 pixels of row 2, split by byte position
     sub           r12, r12, #16

     vst2.8        {d1, d3}, [r0]!       @// 16 luma bytes of row 1
     vst2.8        {d5, d7}, [r6]!       @// 16 luma bytes of row 2

     vrhadd.u8     d0, d0, d4            @// Rounded average of byte-0 chroma over both rows
     vrhadd.u8     d2, d2, d6            @// Rounded average of byte-2 chroma over both rows
     vst2.8        {d0, d2}, [r1]!       @// 16 interleaved chroma bytes

     cmp           r12, #15
     bgt           yuv422i_to_420sp_width_loop
     cmp           r12, #0
     beq           yuv422i_to_420sp_row_loop_end

     @// 1..15 pixels remain: rewind every pointer by (16 - remaining) pixels
     @// so that one more 16-pixel pass finishes exactly at the end of the row.
     @// Source pointers move 2 bytes per pixel; Y and chroma move 1 byte per pixel.
     rsb           r12, r12, #16
     sub           r0, r0, r12
     sub           r6, r6, r12
     sub           r1, r1, r12
     sub           r3, r3, r12, lsl #1
     sub           r8, r8, r12, lsl #1

     vld4.8        {d0-d3}, [r3]!        @// Last 16 pixels of row 1
     vld4.8        {d4-d7}, [r8]!        @// Last 16 pixels of row 2

     vst2.8        {d1, d3}, [r0]!       @// Luma, row 1
     vst2.8        {d5, d7}, [r6]!       @// Luma, row 2

     vrhadd.u8     d0, d0, d4
     vrhadd.u8     d2, d2, d6
     vst2.8        {d0, d2}, [r1]!       @// Interleaved chroma

 yuv422i_to_420sp_row_loop_end:
     @// Move every pointer on to the next pair of rows
     add           r0, r0, r4
     add           r6, r6, r4
     add           r1, r1, r9
     add           r3, r3, r5
     add           r8, r8, r5

     subs          r11, r11, #1          @// One row pair done
     bgt           yuv422i_to_420sp_height_loop
     pop           {r4-r12, pc}          @// Restore registers; saved lr goes to pc
	@/******************************************************************************
	@ *
	@ * Copyright (C) 2015 The Android Open Source Project
	@ *
	@ * Licensed under the Apache License, Version 2.0 (the "License");
	@ * you may not use this file except in compliance with the License.
	@ * You may obtain a copy of the License at:
	@ *
	@ * http://www.apache.org/licenses/LICENSE-2.0
	@ *
	@ * Unless required by applicable law or agreed to in writing, software
	@ * distributed under the License is distributed on an "AS IS" BASIS,
	@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	@ * See the License for the specific language governing permissions and
	@ * limitations under the License.
	@ *
	@ *****************************************************************************
	@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
	@*/

	.text
	.p2align 2

	@/*****************************************************************************
	@*
	@*  Function Name    : ih264e_fmt_conv_420p_to_420sp_a9q
	@*
	@*  Description      : Converts a planar YUV 4:2:0 image (separate Y, U and V
	@*                     planes) into semi-planar 4:2:0: a Y plane plus one
	@*                     plane of byte-interleaved U,V samples.
	@*                     The luma plane is copied row by row unless
	@*                     convert_uv_only == 1; chroma is always interleaved.
	@*
	@*  Arguments        : r0           pu1_y              (source Y)
	@*                     r1           pu1_u              (source U)
	@*                     r2           pu1_v              (source V)
	@*                     r3           pu1_dest_y         (destination Y)
	@*                     Stack arguments, offsets taken after the 40-byte push:
	@*                     [sp, #40]    pu1_dest_uv
	@*                     [sp, #44]    u2_height
	@*                     [sp, #48]    u2_width
	@*                     [sp, #52]    u2_stridey
	@*                     [sp, #56]    u2_strideu
	@*                     [sp, #60]    u2_stridev
	@*                     [sp, #64]    u2_dest_stride_y
	@*                     [sp, #68]    u2_dest_stride_uv
	@*                     [sp, #72]    convert_uv_only
	@*
	@*  Values Returned  : None
	@*
	@*  Register Usage   : r0 - r9, d0 - d1 (r4-r12 and lr are saved/restored)
	@*
	@*  Stack Usage      : 40 bytes (r4-r12, lr)
	@*
	@*  Known Limitations
	@*       Assumptions : Width must be at least 16. Each row is copied in
	@*                     16-byte (luma) / 8-byte (chroma) vectors; a row whose
	@*                     length is not a multiple of the vector size is
	@*                     finished with one overlapping vector that ends exactly
	@*                     on the last sample, so a shorter row would make that
	@*                     vector start before the row.
	@*                     Height is assumed to be even and non-zero; both row
	@*                     loops execute at least once.
	@*
	@*  Revision History :
	@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
	@*         07 06 2010   Varshita        Draft
	@*         07 06 2010   Naveen Kr T     Completed
	@*
	@*****************************************************************************/
	.global ih264e_fmt_conv_420p_to_420sp_a9q

	ih264e_fmt_conv_420p_to_420sp_a9q:

	@// Save r4-r12 and lr: 10 words = 40 bytes, so the first stack argument
	@// is now at [sp, #40].
	stmfd sp!, {r4-r12, lr}

	ldr r4, [sp, #72] @// r4 = convert_uv_only

	cmp r4, #1
	beq yuv420sp_uv_chroma @// Skip the luma copy when only chroma is requested

	@// ---------------- Luma plane copy ----------------
	ldr r4, [sp, #44] @// r4 = u2_height (row counter)
	ldr r5, [sp, #48] @// r5 = u2_width
	ldr r7, [sp, #52] @// r7 = u2_stridey
	ldr r8, [sp, #64] @// r8 = u2_dest_stride_y
	sub r7, r7, r5 @// r7 = source gap from row end to next row start
	sub r8, r8, r5 @// r8 = destination gap from row end to next row start

	yuv420sp_uv_row_loop_y:
	mov r6, r5 @// r6 = bytes left in this row

	yuv420sp_uv_col_loop_y:
	pld [r0, #128]
	vld1.8 {d0, d1}, [r0]! @// Copy 16 luma bytes
	vst1.8 {d0, d1}, [r3]!
	sub r6, r6, #16
	cmp r6, #15
	bgt yuv420sp_uv_col_loop_y @// At least one more full vector remains

	cmp r6, #0
	beq yuv420sp_uv_row_loop_end_y
	@// 1..15 bytes remain: step both pointers back by (16 - remaining) so
	@// that one more 16-byte copy ends exactly on the last byte of the row.
	@// Ex: width 162 -> loop handles 160 bytes, pointers move back to
	@// byte 146 and the final copy covers bytes 146..161.
	rsb r6, r6, #16
	sub r0, r0, r6
	sub r3, r3, r6

	vld1.8 {d0, d1}, [r0]!
	vst1.8 {d0, d1}, [r3]!

	yuv420sp_uv_row_loop_end_y:
	add r0, r0, r7 @// Advance source to next row
	add r3, r3, r8 @// Advance destination to next row
	subs r4, r4, #1
	bgt yuv420sp_uv_row_loop_y

	@// ---------------- Chroma interleave ----------------
	yuv420sp_uv_chroma:

	ldr r3, [sp, #40] @// r3 = pu1_dest_uv
	ldr r4, [sp, #44] @// r4 = u2_height
	ldr r5, [sp, #48] @// r5 = u2_width
	ldr r7, [sp, #56] @// r7 = u2_strideu
	ldr r9, [sp, #60] @// r9 = u2_stridev
	ldr r8, [sp, #68] @// r8 = u2_dest_stride_uv

	sub r7, r7, r5, lsr #1 @// r7 = U source gap (strideu - width/2)
	sub r9, r9, r5, lsr #1 @// r9 = V source gap (stridev - width/2)
	sub r8, r8, r5 @// r8 = destination gap (UV row is width bytes)

	mov r5, r5, lsr #1 @// r5 = chroma samples per row (width / 2)
	mov r4, r4, lsr #1 @// r4 = chroma rows (height / 2)

	yuv420sp_uv_row_loop_uv:
	mov r6, r5 @// r6 = U (and V) samples left in this row

	yuv420sp_uv_col_loop_uv:
	pld [r1, #128]
	pld [r2, #128]
	vld1.8 d0, [r1]! @// 8 U samples
	vld1.8 d1, [r2]! @// 8 V samples
	vst2.8 {d0, d1}, [r3]! @// Store 16 bytes interleaved as U0 V0 U1 V1 ...
	sub r6, r6, #8
	cmp r6, #7
	bgt yuv420sp_uv_col_loop_uv

	cmp r6, #0
	beq yuv420sp_uv_row_loop_end_uv
	@// 1..7 samples remain: step the U and V pointers back by
	@// (8 - remaining) samples and the interleaved destination back by
	@// twice that, so one more 8-sample pass ends exactly on the row end.
	rsb r6, r6, #8
	sub r1, r1, r6
	sub r2, r2, r6
	sub r3, r3, r6, lsl #1

	vld1.8 d0, [r1]!
	vld1.8 d1, [r2]!
	vst2.8 {d0, d1}, [r3]!

	yuv420sp_uv_row_loop_end_uv:
	add r1, r1, r7 @// Next U row
	add r2, r2, r9 @// Next V row (uses the V stride, not the U stride)
	add r3, r3, r8 @// Next interleaved UV row
	subs r4, r4, #1
	bgt yuv420sp_uv_row_loop_uv
	@// Restore saved registers and return (saved lr is popped into pc)
	ldmfd sp!, {r4-r12, pc}





	@ /**
	@ *******************************************************************************
	@ *
	@ * @brief ih264e_fmt_conv_422i_to_420sp_a9q
	@ *     Converts an interleaved 4:2:2 image (2 bytes per pixel) into a Y plane
	@ *     plus one interleaved chroma plane, two source rows per iteration.
	@ *
	@ * Inputs (stack offsets are taken after the 40-byte register push):
	@ *     r0         - pu1_y             - destination Y plane
	@ *     r1         - pu1_u             - destination interleaved chroma plane
	@ *     r2         - pu1_v             - not read by this routine
	@ *     r3         - pu2_yuv422i       - source interleaved 4:2:2 image
	@ *     [sp, #40]  - u4_width          - width of the Y plane
	@ *     [sp, #44]  - u4_height         - height of the Y plane
	@ *     [sp, #48]  - u4_stride_y       - stride of the Y plane
	@ *     [sp, #52]  - u4_stride_u       - stride of the chroma plane written via r1
	@ *     [sp, #56]  - u4_stride_v       - not read by this routine
	@ *     [sp, #60]  - u4_stride_yuv422i - source stride in pixels (2 bytes each)
	@ *
	@ * @par   Description
	@ *   In every 4-byte source group, bytes 1 and 3 are stored to the Y plane and
	@ *   bytes 0 and 2 are treated as the two chroma samples (U and V according to
	@ *   the original comments). The chroma of two vertically adjacent rows is
	@ *   combined with a rounding halving add (VRHADD) and written as one
	@ *   interleaved chroma row.
	@ *
	@ * @returns None
	@ *
	@ * @remarks
	@ *   16 pixels are handled per vector pass; a row whose width is not a
	@ *   multiple of 16 is completed by one overlapping pass that ends on the
	@ *   last pixel, so the width must be at least 16.
	@ *   Rows are consumed in pairs (height >> 1 iterations, at least one).
	@ *
	@ * Revision History :
	@ *         DD MM YYYY   Author(s)              Changes (Describe the changes made)
	@ *         07 06 2010   Harinarayanan K K       Adapeted to 422p
	@ *
	@ *******************************************************************************
	@ */

	.global ih264e_fmt_conv_422i_to_420sp_a9q
	ih264e_fmt_conv_422i_to_420sp_a9q:
	push {r4-r12, lr} @// 10 words saved; stack args start at [sp, #40]

	@// ---- Fetch the stack arguments ----
	ldr r7, [sp, #40] @// r7  = u4_width
	ldr r11, [sp, #44] @// r11 = u4_height
	ldr r4, [sp, #48] @// r4  = u4_stride_y
	ldr r9, [sp, #52] @// r9  = u4_stride_u
	ldr r5, [sp, #60] @// r5  = u4_stride_yuv422i

	@// ---- Second-row pointers ----
	add r6, r0, r4 @// r6 = pu1_y + u4_stride_y
	add r8, r3, r5, lsl #1 @// r8 = pu2_yuv422i + 2 * u4_stride_yuv422i (bytes)

	@// ---- Per-row-pair pointer adjustments ----
	mov r11, r11, asr #1 @// r11 = u4_height / 2 (row pairs)
	sub r9, r9, r7 @// r9  = u4_stride_u - u4_width (chroma row is width bytes)
	rsb r4, r7, r4, lsl #1 @// r4  = 2 * u4_stride_y - u4_width
	sub r14, r5, r7 @// r14 = u4_stride_yuv422i - u4_width
	mov r14, r14, lsl #1 @// r14 = the same gap in bytes
	add r5, r14, r5, lsl #1 @// r5  = byte gap + one extra source row in bytes

	@// Register roles inside the loops
	@//   r0  - Y destination, first row of the pair
	@//   r6  - Y destination, second row of the pair
	@//   r1  - interleaved chroma destination
	@//   r3  - source, first row of the pair
	@//   r8  - source, second row of the pair
	@//   r4  - Y pointer adjustment per row pair
	@//   r9  - chroma pointer adjustment per row pair
	@//   r5  - source pointer adjustment per row pair
	@//   r7  - u4_width
	@//   r11 - row pairs remaining
	@//   r12 - pixels remaining in the current row
	yuv422i_to_420sp_height_loop:

	mov r12, r7 @// r12 = pixels left in this row

	yuv422i_to_420sp_width_loop:
	vld4.8 {d0-d3}, [r3]! @// 16 pixels of row 1, split by byte position
	vld4.8 {d4-d7}, [r8]! @// 16 pixels of row 2, split by byte position
	sub r12, r12, #16

	vst2.8 {d1, d3}, [r0]! @// 16 luma bytes of row 1
	vst2.8 {d5, d7}, [r6]! @// 16 luma bytes of row 2

	vrhadd.u8 d0, d0, d4 @// Rounded average of byte-0 chroma over both rows
	vrhadd.u8 d2, d2, d6 @// Rounded average of byte-2 chroma over both rows
	vst2.8 {d0, d2}, [r1]! @// 16 interleaved chroma bytes

	cmp r12, #15
	bgt yuv422i_to_420sp_width_loop
	cmp r12, #0
	beq yuv422i_to_420sp_row_loop_end

	@// 1..15 pixels remain: rewind every pointer by (16 - remaining) pixels
	@// so that one more 16-pixel pass finishes exactly at the end of the row.
	@// Source pointers move 2 bytes per pixel; Y and chroma move 1 byte per pixel.
	rsb r12, r12, #16
	sub r0, r0, r12
	sub r6, r6, r12
	sub r1, r1, r12
	sub r3, r3, r12, lsl #1
	sub r8, r8, r12, lsl #1

	vld4.8 {d0-d3}, [r3]! @// Last 16 pixels of row 1
	vld4.8 {d4-d7}, [r8]! @// Last 16 pixels of row 2

	vst2.8 {d1, d3}, [r0]! @// Luma, row 1
	vst2.8 {d5, d7}, [r6]! @// Luma, row 2

	vrhadd.u8 d0, d0, d4
	vrhadd.u8 d2, d2, d6
	vst2.8 {d0, d2}, [r1]! @// Interleaved chroma

	yuv422i_to_420sp_row_loop_end:
	@// Move every pointer on to the next pair of rows
	add r0, r0, r4
	add r6, r6, r4
	add r1, r1, r9
	add r3, r3, r5
	add r8, r8, r5

	subs r11, r11, #1 @// One row pair done
	bgt yuv422i_to_420sp_height_loop
	pop {r4-r12, pc} @// Restore registers; saved lr goes to pc