| //****************************************************************************** |
| //* |
| //* Copyright (C) 2015 The Android Open Source Project |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //***************************************************************************** |
| //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| //*/ |
| |
| ///* |
| ////---------------------------------------------------------------------------- |
| //// File Name : impeg2_format_conv.s |
| //// |
| //// Description : This file has the Idct Implementations for the |
| //// MPEG4 SP decoder on neon platform. |
| //// |
| //// Reference Document : |
| //// |
| //// Revision History : |
| //// Date Author Detail Description |
| //// ------------ ---------------- ---------------------------------- |
| //// Jul 07, 2008 Naveen Kumar T Created |
| //// |
| ////------------------------------------------------------------------------- |
| //*/ |
| |
| ///* |
| //// ---------------------------------------------------------------------------- |
| //// Include Files |
| //// ---------------------------------------------------------------------------- |
| //*/ |
| // Assembler symbols: log2_16 = 4 and log2_2 = 1. |
| // NOTE(review): neither symbol is referenced by the code visible in this file; |
| // confirm nothing in the included macro file needs them before removing. |
| .set log2_16 , 4 |
| .set log2_2 , 1 |
| |
| // Everything that follows is assembled into the .text section. |
| .text |
| // Textually include the shared macro file. |
| // NOTE(review): presumably this is where push_v_regs / pop_v_regs come from - confirm. |
| .include "impeg2_neon_macros.s" |
| ///* |
| //// ---------------------------------------------------------------------------- |
| //// Struct/Union Types and Define |
| //// ---------------------------------------------------------------------------- |
| //*/ |
| |
| ///* |
| //// ---------------------------------------------------------------------------- |
| //// Static Global Data section variables |
| //// ---------------------------------------------------------------------------- |
| //*/ |
| ////--------------------------- NONE -------------------------------------------- |
| |
| ///* |
| //// ---------------------------------------------------------------------------- |
| //// Static Prototype Functions |
| //// ---------------------------------------------------------------------------- |
| //*/ |
| //// -------------------------- NONE -------------------------------------------- |
| |
| ///* |
| //// ---------------------------------------------------------------------------- |
| //// Exported functions |
| //// ---------------------------------------------------------------------------- |
| //*/ |
| |
| |
///*****************************************************************************
//*
//*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8()
//*
//*  Description      : Converts an image from YUV420P (planar) to YUV420SP
//*                     (semi-planar) with chroma interleaved as U,V,U,V,...
//*
//*                     Unless convert_uv_only == 1, the luma plane is copied
//*                     row by row from pu1_y to pu1_dest_y, 16 bytes at a time.
//*                     Then (height >> 1) chroma rows are produced: 8 bytes of
//*                     U and 8 bytes of V are loaded and written as 16
//*                     interleaved bytes to pu1_dest_uv.
//*
//*  Arguments        : x0        pu1_y
//*                     x1        pu1_u
//*                     x2        pu1_v
//*                     x3        pu1_dest_y
//*                     x4        pu1_dest_uv
//*                     x5        u2_height
//*                     x6        u2_width
//*                     x7        u2_stridey
//*                     (stack offsets below are relative to sp AFTER the
//*                      prologue of this function)
//*                     sp, #80   u2_strideu
//*                     sp, #88   u2_stridev
//*                     sp, #96   u2_dest_stride_y
//*                     sp, #104  u2_dest_stride_uv
//*                     sp, #112  convert_uv_only
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x7, x8, x9, x10, x14, x16, x20, v0, v1
//*                     (x0-x6 are advanced / modified in place)
//*
//*  Stack Usage      : 80 Bytes (16 for x19/x20 plus what push_v_regs
//*                     reserves; the argument offsets above assume that is
//*                     64 bytes - the macro is defined in the included file)
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*       Assumptions : Image Width : must be >= 16. It need not be a multiple
//*                                   of 16: the last partial vector of a row
//*                                   is handled by stepping the pointers back
//*                                   and re-copying a full, overlapping vector.
//*                                   For chroma the width is rounded up to the
//*                                   next even value first.
//*                     Image Height: Assumed to be even (chroma row count is
//*                                   height >> 1).
//*                     NOTE(review): x5, x6 and x7 are used as 64-bit values
//*                     without any extension here - confirm that the callers
//*                     pass them with clean upper bits.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         07 06 2010   Varshita        Draft
//*         07 06 2010   Naveen Kr T     Completed
//*
//*****************************************************************************/
    .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8:

    //// Prologue: the stack argument offsets used below (#80 .. #112) are
    //// only valid after both of these have executed.
    push_v_regs
    stp             x19, x20, [sp, #-16]!

    ldr             w14, [sp, #112]             //// convert_uv_only
    cmp             w14, #1
    beq             yuv420sp_uv_chroma          //// ==1 : skip the luma copy

    ///* ---------------------------- Luma copy ---------------------------- */
    ldr             w8, [sp, #96]               //// u2_dest_stride_y
    uxtw            x8, w8

    sub             x7, x7, x6                  //// x7 = stridey - width (source row padding)
    sub             x8, x8, x6                  //// x8 = dest_stride_y - width (dest row padding)

    //// Count luma rows in a copy: x5 must still hold the image height when
    //// the chroma section derives its own row count from it.
    mov             x9, x5

yuv420sp_uv_row_loop_y:
    mov             x16, x6                     //// x16 = bytes left in this row

yuv420sp_uv_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_uv_col_loop_y      //// at least 16 bytes left

    cmp             x16, #0
    beq             yuv420sp_uv_row_loop__y     //// row finished exactly

    //// 1..15 bytes are left. Step both pointers back by (16 - left) so that
    //// one more full 16 byte copy ends exactly at the end of the row.
    //// Ex: for width 162 the loop above handles 160 bytes; both pointers are
    //// moved back to byte 146 and bytes 146..161 are copied.
    sub             x20, x16, #16
    neg             x16, x20                    //// x16 = 16 - left
    sub             x0, x0, x16
    sub             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_uv_row_loop__y:
    add             x0, x0, x7                  //// next source row
    add             x3, x3, x8                  //// next destination row
    subs            x9, x9, #1
    bgt             yuv420sp_uv_row_loop_y

    ///* ------------------------ Chroma interleave ------------------------ */
yuv420sp_uv_chroma:
    ldr             w7, [sp, #80]               //// u2_strideu
    sxtw            x7, w7

    ldr             w10, [sp, #88]              //// u2_stridev
    sxtw            x10, w10

    ldr             w8, [sp, #104]              //// u2_dest_stride_uv
    sxtw            x8, w8

    add             x6, x6, #1
    bic             x6, x6, #1                  //// round width up to an even value

    sub             x7, x7, x6, lsr #1          //// U source row padding
    sub             x10, x10, x6, lsr #1        //// V source row padding
    sub             x8, x8, x6                  //// interleaved destination row padding

    lsr             x6, x6, #1                  //// chroma samples per row, per plane
    lsr             x5, x5, #1                  //// chroma rows

yuv420sp_uv_row_loop_uv:
    mov             x16, x6                     //// x16 = chroma samples left in this row

yuv420sp_uv_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]

    ld1             {v0.8b}, [x1], #8           //// v0 = 8 bytes of U
    ld1             {v1.8b}, [x2], #8           //// v1 = 8 bytes of V
    st2             {v0.8b, v1.8b}, [x4], #16   //// store U0 V0 U1 V1 ...

    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_uv_col_loop_uv     //// at least 8 samples left

    cmp             x16, #0
    beq             yuv420sp_uv_row_loop__uv    //// row finished exactly

    //// 1..7 samples are left. Step the U and V pointers back by (8 - left)
    //// and the interleaved destination back by twice that, then convert one
    //// more full group of 8 samples that ends at the end of the row.
    sub             x20, x16, #8
    neg             x16, x20                    //// x16 = 8 - left
    sub             x1, x1, x16
    sub             x2, x2, x16
    sub             x4, x4, x16, lsl #1

    ld1             {v0.8b}, [x1], #8
    ld1             {v1.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_uv_row_loop__uv:
    add             x1, x1, x7                  //// next U row
    add             x2, x2, x10                 //// next V row
    add             x4, x4, x8                  //// next interleaved row
    subs            x5, x5, #1
    bgt             yuv420sp_uv_row_loop_uv

    //// Epilogue: undo the prologue in reverse order
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret
| |
| |
| |
| |
| |
///*****************************************************************************
//*
//*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8()
//*
//*  Description      : Converts an image from YUV420P (planar) to YUV420SP
//*                     (semi-planar) with chroma interleaved as V,U,V,U,...
//*
//*                     Same flow as
//*                     impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8; the only
//*                     difference is the register order of the chroma loads:
//*                     U goes to v1 and V to v0, so the interleaving store
//*                     writes V first.
//*
//*                     Unless convert_uv_only == 1, the luma plane is copied
//*                     row by row from pu1_y to pu1_dest_y, 16 bytes at a time.
//*                     Then (height >> 1) chroma rows are produced: 8 bytes of
//*                     U and 8 bytes of V are loaded and written as 16
//*                     interleaved bytes to pu1_dest_uv.
//*
//*  Arguments        : x0        pu1_y
//*                     x1        pu1_u
//*                     x2        pu1_v
//*                     x3        pu1_dest_y
//*                     x4        pu1_dest_uv
//*                     x5        u2_height
//*                     x6        u2_width
//*                     x7        u2_stridey
//*                     (stack offsets below are relative to sp AFTER the
//*                      prologue of this function)
//*                     sp, #80   u2_strideu
//*                     sp, #88   u2_stridev
//*                     sp, #96   u2_dest_stride_y
//*                     sp, #104  u2_dest_stride_uv
//*                     sp, #112  convert_uv_only
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x7, x8, x9, x10, x14, x16, x20, v0, v1
//*                     (x0-x6 are advanced / modified in place)
//*
//*  Stack Usage      : 80 Bytes (16 for x19/x20 plus what push_v_regs
//*                     reserves; the argument offsets above assume that is
//*                     64 bytes - the macro is defined in the included file)
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*       Assumptions : Image Width : must be >= 16. It need not be a multiple
//*                                   of 16: the last partial vector of a row
//*                                   is handled by stepping the pointers back
//*                                   and re-copying a full, overlapping vector.
//*                                   For chroma the width is rounded up to the
//*                                   next even value first.
//*                     Image Height: Assumed to be even (chroma row count is
//*                                   height >> 1).
//*                     NOTE(review): x5, x6 and x7 are used as 64-bit values
//*                     without any extension here - confirm that the callers
//*                     pass them with clean upper bits.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         07 06 2010   Varshita        Draft
//*         07 06 2010   Naveen Kr T     Completed
//*
//*****************************************************************************/

    .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8:

    //// Prologue: the stack argument offsets used below (#80 .. #112) are
    //// only valid after both of these have executed.
    push_v_regs
    stp             x19, x20, [sp, #-16]!

    ldr             w14, [sp, #112]             //// convert_uv_only
    cmp             w14, #1
    beq             yuv420sp_vu_chroma          //// ==1 : skip the luma copy

    ///* ---------------------------- Luma copy ---------------------------- */
    ldr             w8, [sp, #96]               //// u2_dest_stride_y
    uxtw            x8, w8

    sub             x7, x7, x6                  //// x7 = stridey - width (source row padding)
    sub             x8, x8, x6                  //// x8 = dest_stride_y - width (dest row padding)

    //// Count luma rows in a copy: x5 must still hold the image height when
    //// the chroma section derives its own row count from it.
    mov             x9, x5

yuv420sp_vu_row_loop_y:
    mov             x16, x6                     //// x16 = bytes left in this row

yuv420sp_vu_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_vu_col_loop_y      //// at least 16 bytes left

    cmp             x16, #0
    beq             yuv420sp_vu_row_loop__y     //// row finished exactly

    //// 1..15 bytes are left. Step both pointers back by (16 - left) so that
    //// one more full 16 byte copy ends exactly at the end of the row.
    //// Ex: for width 162 the loop above handles 160 bytes; both pointers are
    //// moved back to byte 146 and bytes 146..161 are copied.
    sub             x20, x16, #16
    neg             x16, x20                    //// x16 = 16 - left
    sub             x0, x0, x16
    sub             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_vu_row_loop__y:
    add             x0, x0, x7                  //// next source row
    add             x3, x3, x8                  //// next destination row
    subs            x9, x9, #1
    bgt             yuv420sp_vu_row_loop_y

    ///* ------------------------ Chroma interleave ------------------------ */
yuv420sp_vu_chroma:
    ldr             w7, [sp, #80]               //// u2_strideu
    sxtw            x7, w7

    ldr             w10, [sp, #88]              //// u2_stridev
    sxtw            x10, w10

    ldr             w8, [sp, #104]              //// u2_dest_stride_uv
    sxtw            x8, w8

    add             x6, x6, #1
    bic             x6, x6, #1                  //// round width up to an even value

    sub             x7, x7, x6, lsr #1          //// U source row padding
    sub             x10, x10, x6, lsr #1        //// V source row padding
    sub             x8, x8, x6                  //// interleaved destination row padding

    lsr             x6, x6, #1                  //// chroma samples per row, per plane
    lsr             x5, x5, #1                  //// chroma rows

yuv420sp_vu_row_loop_uv:
    mov             x16, x6                     //// x16 = chroma samples left in this row

yuv420sp_vu_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]

    ld1             {v1.8b}, [x1], #8           //// v1 = 8 bytes of U
    ld1             {v0.8b}, [x2], #8           //// v0 = 8 bytes of V
    st2             {v0.8b, v1.8b}, [x4], #16   //// store V0 U0 V1 U1 ...

    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_vu_col_loop_uv     //// at least 8 samples left

    cmp             x16, #0
    beq             yuv420sp_vu_row_loop__uv    //// row finished exactly

    //// 1..7 samples are left. Step the U and V pointers back by (8 - left)
    //// and the interleaved destination back by twice that, then convert one
    //// more full group of 8 samples that ends at the end of the row.
    sub             x20, x16, #8
    neg             x16, x20                    //// x16 = 8 - left
    sub             x1, x1, x16
    sub             x2, x2, x16
    sub             x4, x4, x16, lsl #1

    ld1             {v1.8b}, [x1], #8
    ld1             {v0.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_vu_row_loop__uv:
    add             x1, x1, x7                  //// next U row
    add             x2, x2, x10                 //// next V row
    add             x4, x4, x8                  //// next interleaved row
    subs            x5, x5, #1
    bgt             yuv420sp_vu_row_loop_uv

    //// Epilogue: undo the prologue in reverse order
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret
| |