///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//* ihevcd_fmt_conv_420sp_to_rgba8888.s
//*
//* //brief
//* contains function definitions for format conversions
//*
//* //author
//* ittiam
//*
//* //par list of functions:
//*
//*
//* //remarks
//* none
//*
//*******************************************************************************/
.equ DO1STROUNDING, 0
// ARM
//
// PRESERVE8
.text
.p2align 2
.include "ihevc_neon_macros.s"
///*****************************************************************************
//* *
//* Function Name : ihevcd_fmt_conv_420sp_to_rgba8888() *
//* *
//* Description : This function converts the image from YUV420SP (NV12) *
//* color space to RGBA8888 color space. The function *
//* can be invoked at the MB level. *
//* *
//* Arguments : x0 pubY *
//* x1 pubUV *
//* x2 pusRGB (RGBA8888 output) *
//* x3 usWidth *
//* x4 usHeight *
//* x5 usStrideY *
//* x6 usStrideUV *
//* x7 usStrideRGB (in pixels) *
//* *
//* Values Returned : None *
//* *
//* Register Usage : x0 - x14 *
//* *
//* Stack Usage : 48 Bytes *
//* *
//* Interruptibility : Interruptible *
//* *
//* Known Limitations *
//* Assumptions: Image Width: Assumed to be multiple of 16 and *
//* greater than or equal to 16 *
//* Image Height: Assumed to be even. *
//* *
//* Revision History : *
//* DD MM YYYY Author(s) Changes (Describe the changes made) *
//* 07 06 2010 Varshita Draft *
//* 07 06 2010 Naveen Kr T Completed *
//* 05 08 2013 Naveen K P Modified for HEVC *
//*****************************************************************************/
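///*****************************************************************************
//* Reference sketch of the per-pixel math, written out in C. Illustration only:
//* the names below are made up here, and the assembly processes 16 pixels and
//* two rows per pass rather than one pixel at a time. c1..c4 are the Q13
//* constants loaded into v0 below, with 0xF379 and 0xE5F8 read as negative
//* signed 16-bit values.
//*
//*   static unsigned char clamp_u8(int v)
//*   {
//*       return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
//*   }
//*
//*   /* y is one luma sample; (u, v) is the chroma pair covering its 2x2 block */
//*   static unsigned int yuv_to_rgba8888(int y, int u, int v,
//*                                       int c1, int c2, int c3, int c4)
//*   {
//*       unsigned int r = clamp_u8(y + ((c1 * (v - 128)) >> 13));
//*       unsigned int g = clamp_u8(y + ((c2 * (u - 128) + c3 * (v - 128)) >> 13));
//*       unsigned int b = clamp_u8(y + ((c4 * (u - 128)) >> 13));
//*       /* the ZIP/ST1 sequence below writes the bytes b, g, r, 0 starting at
//*          the lowest address, i.e. alpha is left as 0 */
//*       return b | (g << 8) | (r << 16);
//*   }
//*****************************************************************************/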
.global ihevcd_fmt_conv_420sp_to_rgba8888_av8
.type ihevcd_fmt_conv_420sp_to_rgba8888_av8, function
ihevcd_fmt_conv_420sp_to_rgba8888_av8:
//// push the registers on the stack
// STMFD sp!,{x4-x12,x14}
stp d12,d14,[sp,#-16]!
stp d8,d15,[sp,#-16]! // Storing d15 with { sub sp,sp,#8; str d15,[sp] } caused a bus error,
// so d8 is stored alongside d15 as a dummy to keep sp 16-byte aligned. d8 is not used in the function.
stp x19, x20,[sp,#-16]!
////x0 - Y PTR
////x1 - UV PTR
////x2 - RGBA PTR
////x3 - PIC WIDTH
////x4 - PIC HT (moved to x5 below)
////x5 - STRIDE Y (moved to x6 below)
////x6 - STRIDE UV (moved to x7 below)
////x7 - STRIDE RGBA (moved to x9 below)
////TWO LUMA ROWS (SHARING ONE UV ROW) ARE PROCESSED PER ITERATION
////THE FOUR CONSTANTS ARE:
////C1=0x3311,C2=0xF379,C3=0xE5F8,C4=0x4092
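////Interpreted as signed Q13 values (divide by 8192), these match the chroma
////weights of the common BT.601 video-range YUV-to-RGB conversion (a reading
////inferred from the values, not stated in this file):
////C1 = 0x3311 = 13073 ~ 1.596 (V term of R)
////C2 = 0xF379 = -3207 ~ -0.391 (U term of G)
////C3 = 0xE5F8 = -6664 ~ -0.813 (V term of G)
////C4 = 0x4092 = 16530 ~ 2.018 (U term of B)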
//PLD [x0]
//PLD [x1]
//PLD [x2]
///* can be loaded from a defined const type */
mov x10,#0x3311
mov v0.h[0], w10 ////C1
mov x10,#0xF379
mov v0.h[1], w10 ////C2
mov x10,#0xE5F8
mov v0.h[2], w10 ////C3
mov x10,#0x4092
mov v0.h[3], w10 ////C4
////LOAD CONSTANT 128 INTO A CORTEX REGISTER
MOV x10,#128
dup v1.8b,w10
////D0 HAS C1-C2-C3-C4
//// load other parameters from stack
mov x9, x7
mov x7, x6
mov x6, x5
mov x5, x4
//LDR x4,[sp,#44]
//LDR x8,[sp,#52]
//// calculate offsets, offset = stride - width
SUB x10,x6,x3 //// luma offset
SUB x11,x7,x3 //// chroma (UV) offset; no >>1 because U and V are interleaved
//SUB x12,x8,x3, LSR #1 @// v offset (separate V plane not needed for semi-planar input)
SUB x14,x9,x3 //// rgb offset in pixels
//// calculate height loop count
LSR x5, x5, #1 //// height_cnt = height / 2 (two rows per iteration)
//// create next row pointers for rgb and luma data
ADD x7,x0,x6 //// luma_next_row = luma + luma_stride
ADD x8,x2,x9,LSL #2 //// rgb_next_row = rgb + rgb_stride
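////x7 and x8 track the second of the two rows being processed; the
////stride-minus-width offsets computed above are added at the bottom of the
////height loop to step every pointer down past both rows.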
LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:
////LOAD VALUES OF U&V AND COMPUTE THE R,G,B WEIGHT VALUES.
LD1 {v2.8b, v3.8b},[x1],#16 ////LOAD 8 INTERLEAVED UV PAIRS (16 BYTES)
////VLD1.8 {D3},[x2]! @//LOAD 8 VALUES OF V
//// calculate width loop count
LSR x6, x3, #4 //// width_cnt = width / 16
////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
////LOAD VALUES OF Y 8-BIT VALUES
LD2 {v30.8b, v31.8b},[x0],#16 ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
LD2 {v28.8b, v29.8b},[x7],#16 ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
SUBS x6,x6,#1
BEQ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP
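////The loop is software-pipelined: the UV and Y loads inside the body fetch
////data for the NEXT 16-pixel column, so the final column is handled
////separately at WIDTH_LOOP_SKIP and the body never loads past the row end.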
LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
//VMOV.I8 Q1,#128
UZP1 v27.8b, v2.8b, v3.8b
UZP2 v3.8b, v2.8b, v3.8b
mov v2.d[0], v27.d[0]
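////v2 now holds the 8 U samples and v3 the 8 V samples, de-interleaved from
////the UV pairs (taking U as the first byte of each pair, as the comments in
////this file assume).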
////NEED TO SUBTRACT (U-128) AND (V-128)
////(D2-D1),(D3-D1)
uSUBL v4.8h, v2.8b, v1.8b ////(U-128)
uSUBL v6.8h, v3.8b, v1.8b ////(V-128)
////LOAD VALUES OF U&V for next row
LD1 {v2.8b, v3.8b},[x1],#16 ////LOAD THE NEXT 8 INTERLEAVED UV PAIRS
////VLD1.8 {D3},[x2]! @//LOAD 8 VALUES OF V
//PLD [x0]
prfm PLDL1KEEP,[x1]
////MULTIPLY Q2,Q3 (U-128 AND V-128) BY THE COEFFICIENTS
sMULL v5.4s, v4.4h, v0.h[3] ////(U-128)*C4 FOR B
sMULL2 v7.4s, v4.8h, v0.h[3] ////(U-128)*C4 FOR B
sMULL v20.4s, v6.4h, v0.h[0] ////(V-128)*C1 FOR R
sMULL2 v22.4s, v6.8h, v0.h[0] ////(V-128)*C1 FOR R
sMULL v12.4s, v4.4h, v0.h[1] ////(U-128)*C2 FOR G
sMLAL v12.4s, v6.4h, v0.h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
sMULL2 v14.4s, v4.8h, v0.h[1] ////(U-128)*C2 FOR G
sMLAL2 v14.4s, v6.8h, v0.h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
////NARROW RIGHT SHIFT BY 13 FOR R&B
sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
////Q4 - WEIGHT FOR B
////NARROW RIGHT SHIFT BY 13 FOR R&B
sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
////Q5 - WEIGHT FOR R
////NARROW RIGHT SHIFT BY 13 FOR G
sqshrn v12.4h, v12.4s,#13 ////D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
////Q6 - WEIGHT FOR G
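////Each 16-bit weight lane corresponds to one U/V pair, which in 4:2:0 covers
////a 2x2 block of luma samples, so v5/v7/v12 are reused for the even and odd
////luma samples of both rows below.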
UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B
UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R
UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B
UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
sqxtun v15.8b, v18.8h
sqxtun v16.8b, v16.8h
movi v17.8b, #0
sqxtun v20.8b, v20.8h
sqxtun v21.8b, v24.8h
sqxtun v22.8b, v22.8h
movi v23.8b, #0
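////Interleave the saturated bytes: first B with G and R with A(=0), then the
////halfword pairs, then the word pairs, so that v14/v20/v16/v22 each hold four
////pixels with bytes ordered B,G,R,A from the lowest address when stored.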
ZIP1 v27.8b, v14.8b, v15.8b
ZIP2 v15.8b, v14.8b, v15.8b
mov v14.d[0], v27.d[0]
ZIP1 v27.8b, v16.8b, v17.8b
ZIP2 v17.8b, v16.8b, v17.8b
mov v16.d[0], v27.d[0]
ZIP1 v27.8b, v20.8b, v21.8b
ZIP2 v21.8b, v20.8b, v21.8b
mov v20.d[0], v27.d[0]
ZIP1 v27.8b, v22.8b, v23.8b
ZIP2 v23.8b, v22.8b, v23.8b
mov v22.d[0], v27.d[0]
mov v14.d[1], v15.d[0]
mov v20.d[1], v21.d[0]
mov v16.d[1], v17.d[0]
mov v22.d[1], v23.d[0]
ZIP1 v27.8h, v14.8h, v16.8h
ZIP2 v26.8h, v14.8h, v16.8h
ZIP1 v25.8h, v20.8h, v22.8h
ZIP2 v19.8h, v20.8h, v22.8h
ZIP1 v14.4s, v27.4s, v25.4s
ZIP2 v20.4s, v27.4s, v25.4s
ZIP1 v16.4s, v26.4s, v19.4s
ZIP2 v22.4s, v26.4s, v19.4s
ST1 {v14.4s},[x2],#16
ST1 {v20.4s},[x2],#16
ST1 {v16.4s},[x2],#16
ST1 {v22.4s},[x2],#16
////THE 16 RGBA PIXELS OF ROW 1 HAVE BEEN STORED
////NOW COMPUTE Y+B, Y+R AND Y+G FOR ROW 2 USING THE SAME CHROMA WEIGHTS
UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B
UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R
UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B
UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
////LOAD VALUES OF Y 8-BIT VALUES
LD2 {v30.8b, v31.8b},[x0],#16 ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
LD2 {v28.8b, v29.8b},[x7],#16 ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
prfm PLDL1KEEP,[x0]
prfm PLDL1KEEP,[x7]
sqxtun v14.8b, v14.8h
sqxtun v15.8b, v18.8h
sqxtun v16.8b, v16.8h
movi v17.8b, #0
sqxtun v20.8b, v20.8h
sqxtun v21.8b, v24.8h
sqxtun v22.8b, v22.8h
movi v23.8b, #0
ZIP1 v27.8b, v14.8b, v15.8b
ZIP2 v15.8b, v14.8b, v15.8b
mov v14.d[0], v27.d[0]
ZIP1 v27.8b, v16.8b, v17.8b
ZIP2 v17.8b, v16.8b, v17.8b
mov v16.d[0], v27.d[0]
ZIP1 v27.8b, v20.8b, v21.8b
ZIP2 v21.8b, v20.8b, v21.8b
mov v20.d[0], v27.d[0]
ZIP1 v27.8b, v22.8b, v23.8b
ZIP2 v23.8b, v22.8b, v23.8b
mov v22.d[0], v27.d[0]
mov v14.d[1], v15.d[0]
mov v20.d[1], v21.d[0]
mov v16.d[1], v17.d[0]
mov v22.d[1], v23.d[0]
ZIP1 v27.8h, v14.8h, v16.8h
ZIP2 v26.8h, v14.8h, v16.8h
ZIP1 v25.8h, v20.8h, v22.8h
ZIP2 v19.8h, v20.8h, v22.8h
ZIP1 v14.4s, v27.4s, v25.4s
ZIP2 v20.4s, v27.4s, v25.4s
ZIP1 v16.4s, v26.4s, v19.4s
ZIP2 v22.4s, v26.4s, v19.4s
ST1 {v14.4s},[x8],#16
ST1 {v20.4s},[x8],#16
ST1 {v16.4s},[x8],#16
ST1 {v22.4s},[x8],#16
SUBS x6,x6,#1 //// width_cnt -= 1
BNE LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP
LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
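////Peeled final 16-pixel column: identical to the loop body above, except that
////no further UV or Y loads are issued.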
//VMOV.I8 Q1,#128
UZP1 v27.8b, v2.8b, v3.8b
UZP2 v3.8b, v2.8b, v3.8b
mov v2.d[0], v27.d[0]
////NEED TO SUBTRACT (U-128) AND (V-128)
////(D2-D1),(D3-D1)
uSUBL v4.8h, v2.8b, v1.8b ////(U-128)
uSUBL v6.8h, v3.8b, v1.8b ////(V-128)
////MULTIPLY Q2,Q3 (U-128 AND V-128) BY THE COEFFICIENTS
sMULL v5.4s, v4.4h, v0.h[3] ////(U-128)*C4 FOR B
sMULL2 v7.4s, v4.8h, v0.h[3] ////(U-128)*C4 FOR B
sMULL v20.4s, v6.4h, v0.h[0] ////(V-128)*C1 FOR R
sMULL2 v22.4s, v6.8h, v0.h[0] ////(V-128)*C1 FOR R
sMULL v12.4s, v4.4h, v0.h[1] ////(U-128)*C2 FOR G
sMLAL v12.4s, v6.4h, v0.h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
sMULL2 v14.4s, v4.8h, v0.h[1] ////(U-128)*C2 FOR G
sMLAL2 v14.4s, v6.8h, v0.h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
////NARROW RIGHT SHIFT BY 13 FOR R&B
sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
////Q4 - WEIGHT FOR B
////NARROW RIGHT SHIFT BY 13 FOR R&B
sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
////Q5 - WEIGHT FOR R
////NARROW RIGHT SHIFT BY 13 FOR G
sqshrn v12.4h, v12.4s,#13 ////D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
////Q6 - WEIGHT FOR G
UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B
UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R
UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B
UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
sqxtun v15.8b, v18.8h
sqxtun v16.8b, v16.8h
movi v17.8b, #0
sqxtun v20.8b, v20.8h
sqxtun v21.8b, v24.8h
sqxtun v22.8b, v22.8h
movi v23.8b, #0
ZIP1 v27.8b, v14.8b, v15.8b
ZIP2 v15.8b, v14.8b, v15.8b
mov v14.d[0], v27.d[0]
ZIP1 v27.8b, v16.8b, v17.8b
ZIP2 v17.8b, v16.8b, v17.8b
mov v16.d[0], v27.d[0]
ZIP1 v27.8b, v20.8b, v21.8b
ZIP2 v21.8b, v20.8b, v21.8b
mov v20.d[0], v27.d[0]
ZIP1 v27.8b, v22.8b, v23.8b
ZIP2 v23.8b, v22.8b, v23.8b
mov v22.d[0], v27.d[0]
mov v14.d[1], v15.d[0]
mov v20.d[1], v21.d[0]
mov v16.d[1], v17.d[0]
mov v22.d[1], v23.d[0]
ZIP1 v27.8h, v14.8h, v16.8h
ZIP2 v26.8h, v14.8h, v16.8h
ZIP1 v25.8h, v20.8h, v22.8h
ZIP2 v19.8h, v20.8h, v22.8h
ZIP1 v14.4s, v27.4s, v25.4s
ZIP2 v20.4s, v27.4s, v25.4s
ZIP1 v16.4s, v26.4s, v19.4s
ZIP2 v22.4s, v26.4s, v19.4s
ST1 {v14.4s},[x2],#16
ST1 {v20.4s},[x2],#16
ST1 {v16.4s},[x2],#16
ST1 {v22.4s},[x2],#16
////THE 16 RGBA PIXELS OF ROW 1 HAVE BEEN STORED
////NOW COMPUTE Y+B, Y+R AND Y+G FOR ROW 2 USING THE SAME CHROMA WEIGHTS
UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B
UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R
UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B
UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
sqxtun v15.8b, v18.8h
sqxtun v16.8b, v16.8h
movi v17.8b, #0
sqxtun v20.8b, v20.8h
sqxtun v21.8b, v24.8h
sqxtun v22.8b, v22.8h
movi v23.8b, #0
ZIP1 v27.8b, v14.8b, v15.8b
ZIP2 v15.8b, v14.8b, v15.8b
mov v14.d[0], v27.d[0]
ZIP1 v27.8b, v16.8b, v17.8b
ZIP2 v17.8b, v16.8b, v17.8b
mov v16.d[0], v27.d[0]
ZIP1 v27.8b, v20.8b, v21.8b
ZIP2 v21.8b, v20.8b, v21.8b
mov v20.d[0], v27.d[0]
ZIP1 v27.8b, v22.8b, v23.8b
ZIP2 v23.8b, v22.8b, v23.8b
mov v22.d[0], v27.d[0]
mov v14.d[1], v15.d[0]
mov v20.d[1], v21.d[0]
mov v16.d[1], v17.d[0]
mov v22.d[1], v23.d[0]
ZIP1 v27.8h, v14.8h, v16.8h
ZIP2 v26.8h, v14.8h, v16.8h
ZIP1 v25.8h, v20.8h, v22.8h
ZIP2 v19.8h, v20.8h, v22.8h
ZIP1 v14.4s, v27.4s, v25.4s
ZIP2 v20.4s, v27.4s, v25.4s
ZIP1 v16.4s, v26.4s, v19.4s
ZIP2 v22.4s, v26.4s, v19.4s
ST1 {v14.4s},[x8],#16
ST1 {v20.4s},[x8],#16
ST1 {v16.4s},[x8],#16
ST1 {v22.4s},[x8],#16
//// Adjust the address pointers
ADD x0,x7,x10 //// luma = luma_next + offset
ADD x2,x8,x14,LSL #2 //// rgb = rgb_next + offset
ADD x7,x0,x3 //// luma_next = luma + width
ADD x8,x2,x3,LSL #2 //// rgb_next_row = rgb + width
ADD x1,x1,x11 //// adjust u pointer
//ADD x2,x2,x12 @// adjust v pointer
ADD x7,x7,x10 //// luma_next = luma + width + offset (because of register crunch)
ADD x8,x8,x14,LSL #2 //// rgb_next_row = rgb + width + offset
SUBS x5,x5,#1 //// height_cnt -= 1
BNE LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP
////POP THE REGISTERS
// LDMFD sp!,{x4-x12,PC}
ldp x19, x20,[sp],#16
ldp d8,d15,[sp],#16 // Loading d15 with { ldr d15,[sp]; add sp,sp,#8 } caused a bus error,
// so d8 is loaded alongside d15 as a dummy. d8 is not used in the function.
ldp d12,d14,[sp],#16
ret
.section .note.GNU-stack,"",%progbits