// common/armv8/impeg2_format_conv.s - platform/external/libmpeg2 - Git at Google

 //******************************************************************************
 //*
 //* Copyright (C) 2015 The Android Open Source Project
 //*
 //* Licensed under the Apache License, Version 2.0 (the "License");
 //* you may not use this file except in compliance with the License.
 //* You may obtain a copy of the License at:
 //*
 //* http://www.apache.org/licenses/LICENSE-2.0
 //*
 //* Unless required by applicable law or agreed to in writing, software
 //* distributed under the License is distributed on an "AS IS" BASIS,
 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 //* See the License for the specific language governing permissions and
 //* limitations under the License.
 //*
 //*****************************************************************************
 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 //*/

 ///*
 ////----------------------------------------------------------------------------
 //// File Name            : impeg2_format_conv.s
 ////
 //// Description          : This file has the YUV420P to YUV420SP format
 ////                        conversion routines for the MPEG2 decoder on neon platform.
 ////
 //// Reference Document   :
 ////
 //// Revision History     :
 ////      Date            Author                  Detail Description
 ////   ------------    ----------------    ----------------------------------
 ////   Jul 07, 2008     Naveen Kumar T                Created
 ////
 ////-------------------------------------------------------------------------
 //*/

 ///*
 //// ----------------------------------------------------------------------------
 //// Include Files
 //// ----------------------------------------------------------------------------
 //*/
 .set log2_16                    ,      4
 .set log2_2                     ,      1

 .text
 .include "impeg2_neon_macros.s"
 ///*
 //// ----------------------------------------------------------------------------
 //// Struct/Union Types and Define
 //// ----------------------------------------------------------------------------
 //*/

 ///*
 //// ----------------------------------------------------------------------------
 //// Static Global Data section variables
 //// ----------------------------------------------------------------------------
 //*/
 ////--------------------------- NONE --------------------------------------------

 ///*
 //// ----------------------------------------------------------------------------
 //// Static Prototype Functions
 //// ----------------------------------------------------------------------------
 //*/
 //// -------------------------- NONE --------------------------------------------

 ///*
 //// ----------------------------------------------------------------------------
 //// Exported functions
 //// ----------------------------------------------------------------------------
 //*/


 ///*****************************************************************************
 //*
 //*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8()
 //*
 //*  Description      : Converts an image from planar YUV420P to semi-planar
 //*                     YUV420SP with the chroma interleaved as U,V,U,V...
 //*                     Unless convert_uv_only == 1 the luma plane is copied
 //*                     row by row first; the U and V planes are then
 //*                     interleaved into pu1_dest_uv.
 //*
 //*  Arguments        : x0          pu1_y
 //*                     x1          pu1_u
 //*                     x2          pu1_v
 //*                     x3          pu1_dest_y
 //*                     x4          pu1_dest_uv
 //*                     x5          u2_height
 //*                     x6          u2_width
 //*                     x7          u2_stridey
 //*                     sp, #80     u2_strideu
 //*                     sp, #88     u2_stridev
 //*                     sp, #96     u2_dest_stride_y
 //*                     sp, #104    u2_dest_stride_uv
 //*                     sp, #112    convert_uv_only
 //*                     (stack offsets are relative to sp after the prologue)
 //*
 //*  Values Returned  : None
 //*
 //*  Register Usage   : x8, x9, x10, x14, x16, x20, v0, v1
 //*                     x0-x7 are modified as running pointers / counters.
 //*
 //*  Stack Usage      : 16 bytes for x19/x20 plus whatever push_v_regs
 //*                     saves (64 bytes, judging by the #80 offset of the
 //*                     first stack argument) = 80 bytes.
 //*
 //*  Interruptibility : Interruptible
 //*
 //*  Known Limitations
 //*       Assumptions: Image Width:  at least 16. Widths that are not a
 //*                     multiple of 16 are handled by re-processing the last
 //*                     16 luma (8 chroma) bytes of the row, which steps
 //*                     backwards inside the row.
 //*                     Image Height: assumed even and non-zero; the chroma
 //*                     row count is height >> 1 and every row loop runs
 //*                     at least once.
 //*       NOTE(review): x5-x7 are used as full 64-bit values without any
 //*                     extension; this assumes the caller passes them
 //*                     zero-extended - TODO confirm against the prototype.
 //*
 //*  Revision History :
 //*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
 //*         07 06 2010   Varshita        Draft
 //*         07 06 2010   Naveen Kr T     Completed
 //*
 //*****************************************************************************/
 .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8
 impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8:

     //// Save callee-saved registers (x20 is used as scratch below)
     push_v_regs
     stp             x19, x20, [sp, #-16]!

     ldr             w14, [sp, #112]     //// Load convert_uv_only

     cmp             w14, #1
     beq             yuv420sp_uv_chroma  //// Skip the luma copy when only chroma is wanted

     ///* Luma preprocessing */
     ldr             w8, [sp, #96]       //// Load u2_dest_stride_y from stack
     uxtw            x8, w8

     sub             x7, x7, x6          //// Source increment      = stridey - width
     sub             x8, x8, x6          //// Destination increment = dest_stride_y - width

     //// Count luma rows in x9 so that the height in x5 is still intact when
     //// the chroma section derives its own row count from it.
     mov             x9, x5

 yuv420sp_uv_row_loop_y:
     mov             x16, x6             //// x16 = luma bytes left in this row

 yuv420sp_uv_col_loop_y:
     prfm            pldl1keep, [x0, #128]
     ld1             {v0.8b, v1.8b}, [x0], #16
     st1             {v0.8b, v1.8b}, [x3], #16
     sub             x16, x16, #16
     cmp             x16, #15
     bgt             yuv420sp_uv_col_loop_y

     cmp             x16, #0
     beq             yuv420sp_uv_row_loop__y
     //// Width is not a multiple of 16: step both pointers back by
     //// (16 - remainder) bytes and copy one more full 16-byte group so the
     //// row ends exactly on its last pixel.
     //// Ex: width 162 -> the loop above copied 160 pixels, both pointers are
     //// moved back to pixel 146 and pixels 146..161 are copied.
     sub             x20, x16, #16
     neg             x16, x20            //// x16 = 16 - remainder
     sub             x0, x0, x16
     sub             x3, x3, x16

     ld1             {v0.8b, v1.8b}, [x0], #16
     st1             {v0.8b, v1.8b}, [x3], #16

 yuv420sp_uv_row_loop__y:
     add             x0, x0, x7          //// Advance source to the next row
     add             x3, x3, x8          //// Advance destination to the next row
     subs            x9, x9, #1
     bgt             yuv420sp_uv_row_loop_y

 yuv420sp_uv_chroma:
     ldr             w7, [sp, #80]       //// Load u2_strideu from stack
     sxtw            x7, w7

     ldr             w10, [sp, #88]      //// Load u2_stridev from stack
     sxtw            x10, w10

     ldr             w8, [sp, #104]      //// Load u2_dest_stride_uv from stack
     sxtw            x8, w8

     add             x6, x6, #1          //// Round the width up to an even value
     bic             x6, x6, #1

     sub             x7, x7, x6, lsr #1  //// U source increment  = strideu - width/2
     sub             x10, x10, x6, lsr #1 //// V source increment = stridev - width/2

     sub             x8, x8, x6          //// Destination increment = dest_stride_uv - width

     lsr             x6, x6, #1          //// Chroma samples per row and plane
     lsr             x5, x5, #1          //// Chroma rows = height / 2

 yuv420sp_uv_row_loop_uv:
     mov             x16, x6             //// x16 = chroma samples left in this row

 yuv420sp_uv_col_loop_uv:
     prfm            pldl1keep, [x1, #128]
     prfm            pldl1keep, [x2, #128]

     ld1             {v0.8b}, [x1], #8   //// 8 U samples
     ld1             {v1.8b}, [x2], #8   //// 8 V samples
     st2             {v0.8b, v1.8b}, [x4], #16   //// Store interleaved U0 V0 U1 V1 ...

     sub             x16, x16, #8
     cmp             x16, #7
     bgt             yuv420sp_uv_col_loop_uv

     cmp             x16, #0
     beq             yuv420sp_uv_row_loop__uv
     //// Chroma width is not a multiple of 8: step U and V back by
     //// (8 - remainder) bytes and the interleaved destination by twice that,
     //// then convert one more full group of 8 U/V pairs.
     //// Ex: width 162 -> 81 samples per plane, the loop above handled 80,
     //// the pointers go back to sample 73 and samples 73..80 are converted.
     sub             x20, x16, #8
     neg             x16, x20            //// x16 = 8 - remainder
     sub             x1, x1, x16
     sub             x2, x2, x16
     sub             x4, x4, x16, lsl #1

     ld1             {v0.8b}, [x1], #8
     ld1             {v1.8b}, [x2], #8
     st2             {v0.8b, v1.8b}, [x4], #16

 yuv420sp_uv_row_loop__uv:
     add             x1, x1, x7          //// Next U row
     add             x2, x2, x10         //// Next V row
     add             x4, x4, x8          //// Next destination row
     subs            x5, x5, #1
     bgt             yuv420sp_uv_row_loop_uv

     //// Restore callee-saved registers and return
     ldp             x19, x20, [sp], #16
     pop_v_regs
     ret


 ///*****************************************************************************
 //*
 //*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8()
 //*
 //*  Description      : Converts an image from planar YUV420P to semi-planar
 //*                     YUV420SP with the chroma interleaved as V,U,V,U...
 //*                     Same flow as
 //*                     impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8; the only
 //*                     difference is that U is loaded into v1 and V into v0
 //*                     so that the interleaving store emits V first.
 //*
 //*  Arguments        : x0          pu1_y
 //*                     x1          pu1_u
 //*                     x2          pu1_v
 //*                     x3          pu1_dest_y
 //*                     x4          pu1_dest_uv
 //*                     x5          u2_height
 //*                     x6          u2_width
 //*                     x7          u2_stridey
 //*                     sp, #80     u2_strideu
 //*                     sp, #88     u2_stridev
 //*                     sp, #96     u2_dest_stride_y
 //*                     sp, #104    u2_dest_stride_uv
 //*                     sp, #112    convert_uv_only
 //*                     (stack offsets are relative to sp after the prologue)
 //*
 //*  Values Returned  : None
 //*
 //*  Register Usage   : x8, x9, x10, x14, x16, x20, v0, v1
 //*                     x0-x7 are modified as running pointers / counters.
 //*
 //*  Stack Usage      : 16 bytes for x19/x20 plus whatever push_v_regs
 //*                     saves (64 bytes, judging by the #80 offset of the
 //*                     first stack argument) = 80 bytes.
 //*
 //*  Interruptibility : Interruptible
 //*
 //*  Known Limitations
 //*       Assumptions: Image Width:  at least 16. Widths that are not a
 //*                     multiple of 16 are handled by re-processing the last
 //*                     16 luma (8 chroma) bytes of the row, which steps
 //*                     backwards inside the row.
 //*                     Image Height: assumed even and non-zero; the chroma
 //*                     row count is height >> 1 and every row loop runs
 //*                     at least once.
 //*       NOTE(review): x5-x7 are used as full 64-bit values without any
 //*                     extension; this assumes the caller passes them
 //*                     zero-extended - TODO confirm against the prototype.
 //*
 //*  Revision History :
 //*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
 //*         07 06 2010   Varshita        Draft
 //*         07 06 2010   Naveen Kr T     Completed
 //*
 //*****************************************************************************/

 .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8
 impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8:

     //// Save callee-saved registers (x20 is used as scratch below)
     push_v_regs
     stp             x19, x20, [sp, #-16]!

     ldr             w14, [sp, #112]     //// Load convert_uv_only

     cmp             w14, #1
     beq             yuv420sp_vu_chroma  //// Skip the luma copy when only chroma is wanted

     ///* Luma preprocessing */
     ldr             w8, [sp, #96]       //// Load u2_dest_stride_y from stack
     uxtw            x8, w8

     sub             x7, x7, x6          //// Source increment      = stridey - width
     sub             x8, x8, x6          //// Destination increment = dest_stride_y - width

     //// Count luma rows in x9 so that the height in x5 is still intact when
     //// the chroma section derives its own row count from it.
     mov             x9, x5

 yuv420sp_vu_row_loop_y:
     mov             x16, x6             //// x16 = luma bytes left in this row

 yuv420sp_vu_col_loop_y:
     prfm            pldl1keep, [x0, #128]
     ld1             {v0.8b, v1.8b}, [x0], #16
     st1             {v0.8b, v1.8b}, [x3], #16
     sub             x16, x16, #16
     cmp             x16, #15
     bgt             yuv420sp_vu_col_loop_y

     cmp             x16, #0
     beq             yuv420sp_vu_row_loop__y
     //// Width is not a multiple of 16: step both pointers back by
     //// (16 - remainder) bytes and copy one more full 16-byte group so the
     //// row ends exactly on its last pixel.
     //// Ex: width 162 -> the loop above copied 160 pixels, both pointers are
     //// moved back to pixel 146 and pixels 146..161 are copied.
     sub             x20, x16, #16
     neg             x16, x20            //// x16 = 16 - remainder
     sub             x0, x0, x16
     sub             x3, x3, x16

     ld1             {v0.8b, v1.8b}, [x0], #16
     st1             {v0.8b, v1.8b}, [x3], #16

 yuv420sp_vu_row_loop__y:
     add             x0, x0, x7          //// Advance source to the next row
     add             x3, x3, x8          //// Advance destination to the next row
     subs            x9, x9, #1
     bgt             yuv420sp_vu_row_loop_y

 yuv420sp_vu_chroma:
     ldr             w7, [sp, #80]       //// Load u2_strideu from stack
     sxtw            x7, w7

     ldr             w10, [sp, #88]      //// Load u2_stridev from stack
     sxtw            x10, w10

     ldr             w8, [sp, #104]      //// Load u2_dest_stride_uv from stack
     sxtw            x8, w8

     add             x6, x6, #1          //// Round the width up to an even value
     bic             x6, x6, #1

     sub             x7, x7, x6, lsr #1  //// U source increment  = strideu - width/2
     sub             x10, x10, x6, lsr #1 //// V source increment = stridev - width/2

     sub             x8, x8, x6          //// Destination increment = dest_stride_uv - width

     lsr             x6, x6, #1          //// Chroma samples per row and plane
     lsr             x5, x5, #1          //// Chroma rows = height / 2

 yuv420sp_vu_row_loop_uv:
     mov             x16, x6             //// x16 = chroma samples left in this row

 yuv420sp_vu_col_loop_uv:
     prfm            pldl1keep, [x1, #128]
     prfm            pldl1keep, [x2, #128]
     ld1             {v1.8b}, [x1], #8   //// 8 U samples -> second element of each pair
     ld1             {v0.8b}, [x2], #8   //// 8 V samples -> first element of each pair
     st2             {v0.8b, v1.8b}, [x4], #16   //// Store interleaved V0 U0 V1 U1 ...
     sub             x16, x16, #8
     cmp             x16, #7
     bgt             yuv420sp_vu_col_loop_uv

     cmp             x16, #0
     beq             yuv420sp_vu_row_loop__uv
     //// Chroma width is not a multiple of 8: step U and V back by
     //// (8 - remainder) bytes and the interleaved destination by twice that,
     //// then convert one more full group of 8 V/U pairs.
     //// Ex: width 162 -> 81 samples per plane, the loop above handled 80,
     //// the pointers go back to sample 73 and samples 73..80 are converted.
     sub             x20, x16, #8
     neg             x16, x20            //// x16 = 8 - remainder
     sub             x1, x1, x16
     sub             x2, x2, x16
     sub             x4, x4, x16, lsl #1

     ld1             {v1.8b}, [x1], #8
     ld1             {v0.8b}, [x2], #8
     st2             {v0.8b, v1.8b}, [x4], #16

 yuv420sp_vu_row_loop__uv:
     add             x1, x1, x7          //// Next U row
     add             x2, x2, x10         //// Next V row
     add             x4, x4, x8          //// Next destination row
     subs            x5, x5, #1
     bgt             yuv420sp_vu_row_loop_uv

     //// Restore callee-saved registers and return
     ldp             x19, x20, [sp], #16
     pop_v_regs
     ret
	//******************************************************************************
	//*
	//* Copyright (C) 2015 The Android Open Source Project
	//*
	//* Licensed under the Apache License, Version 2.0 (the "License");
	//* you may not use this file except in compliance with the License.
	//* You may obtain a copy of the License at:
	//*
	//* http://www.apache.org/licenses/LICENSE-2.0
	//*
	//* Unless required by applicable law or agreed to in writing, software
	//* distributed under the License is distributed on an "AS IS" BASIS,
	//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	//* See the License for the specific language governing permissions and
	//* limitations under the License.
	//*
	//*****************************************************************************
	//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
	//*/

	///*
	////----------------------------------------------------------------------------
	//// File Name : impeg2_format_conv.s
	////
	//// Description : This file has the YUV420P to YUV420SP format
	//// conversion routines for the MPEG2 decoder on neon platform.
	////
	//// Reference Document :
	////
	//// Revision History :
	//// Date Author Detail Description
	//// ------------ ---------------- ----------------------------------
	//// Jul 07, 2008 Naveen Kumar T Created
	////
	////-------------------------------------------------------------------------
	//*/

	///*
	//// ----------------------------------------------------------------------------
	//// Include Files
	//// ----------------------------------------------------------------------------
	//*/
	.set log2_16 , 4
	.set log2_2 , 1

	.text
	.include "impeg2_neon_macros.s"
	///*
	//// ----------------------------------------------------------------------------
	//// Struct/Union Types and Define
	//// ----------------------------------------------------------------------------
	//*/

	///*
	//// ----------------------------------------------------------------------------
	//// Static Global Data section variables
	//// ----------------------------------------------------------------------------
	//*/
	////--------------------------- NONE --------------------------------------------

	///*
	//// ----------------------------------------------------------------------------
	//// Static Prototype Functions
	//// ----------------------------------------------------------------------------
	//*/
	//// -------------------------- NONE --------------------------------------------

	///*
	//// ----------------------------------------------------------------------------
	//// Exported functions
	//// ----------------------------------------------------------------------------
	//*/


	///*****************************************************************************
	//*
	//* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8()
	//*
	//* Description : Converts an image from planar YUV420P to semi-planar
	//* YUV420SP with the chroma interleaved as U,V,U,V...
	//* Unless convert_uv_only == 1 the luma plane is copied row by row
	//* first; the U and V planes are then interleaved into pu1_dest_uv.
	//*
	//* Arguments : x0 pu1_y
	//* x1 pu1_u
	//* x2 pu1_v
	//* x3 pu1_dest_y
	//* x4 pu1_dest_uv
	//* x5 u2_height
	//* x6 u2_width
	//* x7 u2_stridey
	//* sp, #80 u2_strideu
	//* sp, #88 u2_stridev
	//* sp, #96 u2_dest_stride_y
	//* sp, #104 u2_dest_stride_uv
	//* sp, #112 convert_uv_only
	//* (stack offsets are relative to sp after the prologue)
	//*
	//* Values Returned : None
	//*
	//* Register Usage : x8, x9, x10, x14, x16, x20, v0, v1
	//* x0-x7 are modified as running pointers / counters.
	//*
	//* Stack Usage : 16 bytes for x19/x20 plus whatever push_v_regs saves
	//* (64 bytes, judging by the #80 offset of the first stack argument)
	//* = 80 bytes.
	//*
	//* Interruptibility : Interruptible
	//*
	//* Known Limitations
	//* Assumptions: Image Width: at least 16. Widths that are not a multiple
	//* of 16 are handled by re-processing the last 16 luma (8 chroma) bytes
	//* of the row, which steps backwards inside the row.
	//* Image Height: assumed even and non-zero; the chroma row count is
	//* height >> 1 and every row loop runs at least once.
	//* NOTE(review): x5-x7 are used as full 64-bit values without any
	//* extension; this assumes the caller passes them zero-extended -
	//* TODO confirm against the prototype.
	//*
	//* Revision History :
	//* DD MM YYYY Author(s) Changes (Describe the changes made)
	//* 07 06 2010 Varshita Draft
	//* 07 06 2010 Naveen Kr T Completed
	//*
	//*****************************************************************************/
	.global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8
	impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8:

	//// Save callee-saved registers (x20 is used as scratch below)
	    push_v_regs
	    stp             x19, x20, [sp, #-16]!

	    ldr             w14, [sp, #112]     //// Load convert_uv_only

	    cmp             w14, #1
	    beq             yuv420sp_uv_chroma  //// Skip the luma copy when only chroma is wanted

	///* Luma preprocessing */
	    ldr             w8, [sp, #96]       //// Load u2_dest_stride_y from stack
	    uxtw            x8, w8

	    sub             x7, x7, x6          //// Source increment      = stridey - width
	    sub             x8, x8, x6          //// Destination increment = dest_stride_y - width

	//// Count luma rows in x9 so that the height in x5 is still intact when
	//// the chroma section derives its own row count from it.
	    mov             x9, x5

	yuv420sp_uv_row_loop_y:
	    mov             x16, x6             //// x16 = luma bytes left in this row

	yuv420sp_uv_col_loop_y:
	    prfm            pldl1keep, [x0, #128]
	    ld1             {v0.8b, v1.8b}, [x0], #16
	    st1             {v0.8b, v1.8b}, [x3], #16
	    sub             x16, x16, #16
	    cmp             x16, #15
	    bgt             yuv420sp_uv_col_loop_y

	    cmp             x16, #0
	    beq             yuv420sp_uv_row_loop__y
	//// Width is not a multiple of 16: step both pointers back by
	//// (16 - remainder) bytes and copy one more full 16-byte group so the
	//// row ends exactly on its last pixel.
	//// Ex: width 162 -> the loop above copied 160 pixels, both pointers are
	//// moved back to pixel 146 and pixels 146..161 are copied.
	    sub             x20, x16, #16
	    neg             x16, x20            //// x16 = 16 - remainder
	    sub             x0, x0, x16
	    sub             x3, x3, x16

	    ld1             {v0.8b, v1.8b}, [x0], #16
	    st1             {v0.8b, v1.8b}, [x3], #16

	yuv420sp_uv_row_loop__y:
	    add             x0, x0, x7          //// Advance source to the next row
	    add             x3, x3, x8          //// Advance destination to the next row
	    subs            x9, x9, #1
	    bgt             yuv420sp_uv_row_loop_y

	yuv420sp_uv_chroma:
	    ldr             w7, [sp, #80]       //// Load u2_strideu from stack
	    sxtw            x7, w7

	    ldr             w10, [sp, #88]      //// Load u2_stridev from stack
	    sxtw            x10, w10

	    ldr             w8, [sp, #104]      //// Load u2_dest_stride_uv from stack
	    sxtw            x8, w8

	    add             x6, x6, #1          //// Round the width up to an even value
	    bic             x6, x6, #1

	    sub             x7, x7, x6, lsr #1  //// U source increment  = strideu - width/2
	    sub             x10, x10, x6, lsr #1 //// V source increment = stridev - width/2

	    sub             x8, x8, x6          //// Destination increment = dest_stride_uv - width

	    lsr             x6, x6, #1          //// Chroma samples per row and plane
	    lsr             x5, x5, #1          //// Chroma rows = height / 2

	yuv420sp_uv_row_loop_uv:
	    mov             x16, x6             //// x16 = chroma samples left in this row

	yuv420sp_uv_col_loop_uv:
	    prfm            pldl1keep, [x1, #128]
	    prfm            pldl1keep, [x2, #128]

	    ld1             {v0.8b}, [x1], #8   //// 8 U samples
	    ld1             {v1.8b}, [x2], #8   //// 8 V samples
	    st2             {v0.8b, v1.8b}, [x4], #16   //// Store interleaved U0 V0 U1 V1 ...

	    sub             x16, x16, #8
	    cmp             x16, #7
	    bgt             yuv420sp_uv_col_loop_uv

	    cmp             x16, #0
	    beq             yuv420sp_uv_row_loop__uv
	//// Chroma width is not a multiple of 8: step U and V back by
	//// (8 - remainder) bytes and the interleaved destination by twice that,
	//// then convert one more full group of 8 U/V pairs.
	//// Ex: width 162 -> 81 samples per plane, the loop above handled 80,
	//// the pointers go back to sample 73 and samples 73..80 are converted.
	    sub             x20, x16, #8
	    neg             x16, x20            //// x16 = 8 - remainder
	    sub             x1, x1, x16
	    sub             x2, x2, x16
	    sub             x4, x4, x16, lsl #1

	    ld1             {v0.8b}, [x1], #8
	    ld1             {v1.8b}, [x2], #8
	    st2             {v0.8b, v1.8b}, [x4], #16

	yuv420sp_uv_row_loop__uv:
	    add             x1, x1, x7          //// Next U row
	    add             x2, x2, x10         //// Next V row
	    add             x4, x4, x8          //// Next destination row
	    subs            x5, x5, #1
	    bgt             yuv420sp_uv_row_loop_uv

	//// Restore callee-saved registers and return
	    ldp             x19, x20, [sp], #16
	    pop_v_regs
	    ret





	///*****************************************************************************
	//*
	//* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8()
	//*
	//* Description : Converts an image from planar YUV420P to semi-planar
	//* YUV420SP with the chroma interleaved as V,U,V,U...
	//* Same flow as impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8; the only
	//* difference is that U is loaded into v1 and V into v0 so that the
	//* interleaving store emits V first.
	//*
	//* Arguments : x0 pu1_y
	//* x1 pu1_u
	//* x2 pu1_v
	//* x3 pu1_dest_y
	//* x4 pu1_dest_uv
	//* x5 u2_height
	//* x6 u2_width
	//* x7 u2_stridey
	//* sp, #80 u2_strideu
	//* sp, #88 u2_stridev
	//* sp, #96 u2_dest_stride_y
	//* sp, #104 u2_dest_stride_uv
	//* sp, #112 convert_uv_only
	//* (stack offsets are relative to sp after the prologue)
	//*
	//* Values Returned : None
	//*
	//* Register Usage : x8, x9, x10, x14, x16, x20, v0, v1
	//* x0-x7 are modified as running pointers / counters.
	//*
	//* Stack Usage : 16 bytes for x19/x20 plus whatever push_v_regs saves
	//* (64 bytes, judging by the #80 offset of the first stack argument)
	//* = 80 bytes.
	//*
	//* Interruptibility : Interruptible
	//*
	//* Known Limitations
	//* Assumptions: Image Width: at least 16. Widths that are not a multiple
	//* of 16 are handled by re-processing the last 16 luma (8 chroma) bytes
	//* of the row, which steps backwards inside the row.
	//* Image Height: assumed even and non-zero; the chroma row count is
	//* height >> 1 and every row loop runs at least once.
	//* NOTE(review): x5-x7 are used as full 64-bit values without any
	//* extension; this assumes the caller passes them zero-extended -
	//* TODO confirm against the prototype.
	//*
	//* Revision History :
	//* DD MM YYYY Author(s) Changes (Describe the changes made)
	//* 07 06 2010 Varshita Draft
	//* 07 06 2010 Naveen Kr T Completed
	//*
	//*****************************************************************************/

	.global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8
	impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8:

	//// Save callee-saved registers (x20 is used as scratch below)
	    push_v_regs
	    stp             x19, x20, [sp, #-16]!

	    ldr             w14, [sp, #112]     //// Load convert_uv_only

	    cmp             w14, #1
	    beq             yuv420sp_vu_chroma  //// Skip the luma copy when only chroma is wanted

	///* Luma preprocessing */
	    ldr             w8, [sp, #96]       //// Load u2_dest_stride_y from stack
	    uxtw            x8, w8

	    sub             x7, x7, x6          //// Source increment      = stridey - width
	    sub             x8, x8, x6          //// Destination increment = dest_stride_y - width

	//// Count luma rows in x9 so that the height in x5 is still intact when
	//// the chroma section derives its own row count from it.
	    mov             x9, x5

	yuv420sp_vu_row_loop_y:
	    mov             x16, x6             //// x16 = luma bytes left in this row

	yuv420sp_vu_col_loop_y:
	    prfm            pldl1keep, [x0, #128]
	    ld1             {v0.8b, v1.8b}, [x0], #16
	    st1             {v0.8b, v1.8b}, [x3], #16
	    sub             x16, x16, #16
	    cmp             x16, #15
	    bgt             yuv420sp_vu_col_loop_y

	    cmp             x16, #0
	    beq             yuv420sp_vu_row_loop__y
	//// Width is not a multiple of 16: step both pointers back by
	//// (16 - remainder) bytes and copy one more full 16-byte group so the
	//// row ends exactly on its last pixel.
	//// Ex: width 162 -> the loop above copied 160 pixels, both pointers are
	//// moved back to pixel 146 and pixels 146..161 are copied.
	    sub             x20, x16, #16
	    neg             x16, x20            //// x16 = 16 - remainder
	    sub             x0, x0, x16
	    sub             x3, x3, x16

	    ld1             {v0.8b, v1.8b}, [x0], #16
	    st1             {v0.8b, v1.8b}, [x3], #16

	yuv420sp_vu_row_loop__y:
	    add             x0, x0, x7          //// Advance source to the next row
	    add             x3, x3, x8          //// Advance destination to the next row
	    subs            x9, x9, #1
	    bgt             yuv420sp_vu_row_loop_y

	yuv420sp_vu_chroma:
	    ldr             w7, [sp, #80]       //// Load u2_strideu from stack
	    sxtw            x7, w7

	    ldr             w10, [sp, #88]      //// Load u2_stridev from stack
	    sxtw            x10, w10

	    ldr             w8, [sp, #104]      //// Load u2_dest_stride_uv from stack
	    sxtw            x8, w8

	    add             x6, x6, #1          //// Round the width up to an even value
	    bic             x6, x6, #1

	    sub             x7, x7, x6, lsr #1  //// U source increment  = strideu - width/2
	    sub             x10, x10, x6, lsr #1 //// V source increment = stridev - width/2

	    sub             x8, x8, x6          //// Destination increment = dest_stride_uv - width

	    lsr             x6, x6, #1          //// Chroma samples per row and plane
	    lsr             x5, x5, #1          //// Chroma rows = height / 2

	yuv420sp_vu_row_loop_uv:
	    mov             x16, x6             //// x16 = chroma samples left in this row

	yuv420sp_vu_col_loop_uv:
	    prfm            pldl1keep, [x1, #128]
	    prfm            pldl1keep, [x2, #128]
	    ld1             {v1.8b}, [x1], #8   //// 8 U samples -> second element of each pair
	    ld1             {v0.8b}, [x2], #8   //// 8 V samples -> first element of each pair
	    st2             {v0.8b, v1.8b}, [x4], #16   //// Store interleaved V0 U0 V1 U1 ...
	    sub             x16, x16, #8
	    cmp             x16, #7
	    bgt             yuv420sp_vu_col_loop_uv

	    cmp             x16, #0
	    beq             yuv420sp_vu_row_loop__uv
	//// Chroma width is not a multiple of 8: step U and V back by
	//// (8 - remainder) bytes and the interleaved destination by twice that,
	//// then convert one more full group of 8 V/U pairs.
	//// Ex: width 162 -> 81 samples per plane, the loop above handled 80,
	//// the pointers go back to sample 73 and samples 73..80 are converted.
	    sub             x20, x16, #8
	    neg             x16, x20            //// x16 = 8 - remainder
	    sub             x1, x1, x16
	    sub             x2, x2, x16
	    sub             x4, x4, x16, lsl #1

	    ld1             {v1.8b}, [x1], #8
	    ld1             {v0.8b}, [x2], #8
	    st2             {v0.8b, v1.8b}, [x4], #16

	yuv420sp_vu_row_loop__uv:
	    add             x1, x1, x7          //// Next U row
	    add             x2, x2, x10         //// Next V row
	    add             x4, x4, x8          //// Next destination row
	    subs            x5, x5, #1
	    bgt             yuv420sp_vu_row_loop_uv

	//// Restore callee-saved registers and return
	    ldp             x19, x20, [sp], #16
	    pop_v_regs
	    ret