///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//* ihevcd_fmt_conv_420sp_to_rgba8888.s
//*
//* //brief
//* contains function definitions for format conversions
//*
//* //author
//* ittiam
//*
//* //par list of functions:
//*
//*
//* //remarks
//* none
//*
//*******************************************************************************/
.equ DO1STROUNDING, 0
// ARM
//
// PRESERVE8
.text
.p2align 2
.include "ihevc_neon_macros.s"
///*****************************************************************************
//* *
//* Function Name : ihevcd_fmt_conv_420sp_to_rgba8888() *
//* *
//* Description : This function converts the image from YUV420SP (NV12) *
//* color space to RGBA8888 color space. The function *
//* can be invoked at the MB level. *
//* *
//* Arguments : x0 pubY *
//* x1 pubUV *
//* x2 pusRGB (RGBA8888 output) *
//* x3 usWidth *
//* x4 usHeight *
//* x5 usStrideY *
//* x6 usStrideUV *
//* x7 usStrideRGB (in pixels) *
//* *
//* Values Returned : None *
//* *
//* Register Usage : x0 - x14 *
//* *
//* Stack Usage : 48 Bytes *
//* *
//* Interruptibility : Interruptible *
//* *
//* Known Limitations *
//* Assumptions: Image Width: Assumed to be multiple of 16 and *
//* greater than or equal to 16 *
//* Image Height: Assumed to be even. *
//* *
//* Revision History : *
//* DD MM YYYY Author(s) Changes (Describe the changes made) *
//* 07 06 2010 Varshita Draft *
//* 07 06 2010 Naveen Kr T Completed *
//* 05 08 2013 Naveen K P Modified for HEVC *
//*****************************************************************************/
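///*****************************************************************************
//* Reference sketch of the per-pixel math, written out in C. Illustration only:
//* the names below are made up here, and the assembly processes 16 pixels and
//* two rows per pass rather than one pixel at a time. c1..c4 are the Q13
//* constants loaded into v0 below, with 0xF379 and 0xE5F8 read as negative
//* signed 16-bit values.
//*
//*   static unsigned char clamp_u8(int v)
//*   {
//*       return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
//*   }
//*
//*   /* y is one luma sample; (u, v) is the chroma pair covering its 2x2 block */
//*   static unsigned int yuv_to_rgba8888(int y, int u, int v,
//*                                       int c1, int c2, int c3, int c4)
//*   {
//*       unsigned int r = clamp_u8(y + ((c1 * (v - 128)) >> 13));
//*       unsigned int g = clamp_u8(y + ((c2 * (u - 128) + c3 * (v - 128)) >> 13));
//*       unsigned int b = clamp_u8(y + ((c4 * (u - 128)) >> 13));
//*       /* the ZIP/ST1 sequence below writes the bytes b, g, r, 0 starting at
//*          the lowest address, i.e. alpha is left as 0 */
//*       return b | (g << 8) | (r << 16);
//*   }
//*****************************************************************************/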
.global ihevcd_fmt_conv_420sp_to_rgba8888_av8
.type ihevcd_fmt_conv_420sp_to_rgba8888_av8, function
ihevcd_fmt_conv_420sp_to_rgba8888_av8:
//// push the registers on the stack
// STMFD sp!,{x4-x12,x14}
stp d12,d14,[sp,#-16]!
stp d8,d15,[sp,#-16]! // Storing d15 with { sub sp,sp,#8; str d15,[sp] } caused a bus error,
// so d8 is stored alongside d15 as a dummy to keep sp 16-byte aligned. d8 is not used in the function.
stp x19, x20,[sp,#-16]!
////x0 - Y PTR
////x1 - UV PTR
////x2 - RGBA PTR
////x3 - PIC WIDTH
////x4 - PIC HT (moved to x5 below)
////x5 - STRIDE Y (moved to x6 below)
////x6 - STRIDE UV (moved to x7 below)
////x7 - STRIDE RGBA (moved to x9 below)
////TWO LUMA ROWS (SHARING ONE UV ROW) ARE PROCESSED PER ITERATION
////THE FOUR CONSTANTS ARE:
////C1=0x3311,C2=0xF379,C3=0xE5F8,C4=0x4092
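////Interpreted as signed Q13 values (divide by 8192), these match the chroma
////weights of the common BT.601 video-range YUV-to-RGB conversion (a reading
////inferred from the values, not stated in this file):
////C1 = 0x3311 = 13073 ~ 1.596 (V term of R)
////C2 = 0xF379 = -3207 ~ -0.391 (U term of G)
////C3 = 0xE5F8 = -6664 ~ -0.813 (V term of G)
////C4 = 0x4092 = 16530 ~ 2.018 (U term of B)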
//PLD [x0]
//PLD [x1]
//PLD [x2]
///* can be loaded from a defined const type */
mov x10,#0x3311
mov v0.h[0], w10 ////C1
mov x10,#0xF379
mov v0.h[1], w10 ////C2
mov x10,#0xE5F8
mov v0.h[2], w10 ////C3
mov x10,#0x4092
mov v0.h[3], w10 ////C4
////LOAD CONSTANT 128 INTO A CORTEX REGISTER
MOV x10,#128
dup v1.8b,w10
////D0 HAS C1-C2-C3-C4
//// load other parameters from stack
mov x9, x7
mov x7, x6
mov x6, x5
mov x5, x4
//LDR x4,[sp,#44]
//LDR x8,[sp,#52]
//// calculate offsets, offset = stride - width
SUB x10,x6,x3 //// luma offset
SUB x11,x7,x3 //// chroma (UV) offset; no >>1 because U and V are interleaved
//SUB x12,x8,x3, LSR #1 @// v offset (separate V plane not needed for semi-planar input)
SUB x14,x9,x3 //// rgb offset in pixels
//// calculate height loop count
LSR x5, x5, #1 //// height_cnt = height / 2 (two rows per iteration)
//// create next row pointers for rgb and luma data
ADD x7,x0,x6 //// luma_next_row = luma + luma_stride
ADD x8,x2,x9,LSL #2 //// rgb_next_row = rgb + rgb_stride
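////x7 and x8 track the second of the two rows being processed; the
////stride-minus-width offsets computed above are added at the bottom of the
////height loop to step every pointer down past both rows.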
LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:
////LOAD VALUES OF U&V AND COMPUTE THE R,G,B WEIGHT VALUES.
LD1 {v2.8b, v3.8b},[x1],#16 ////LOAD 8 INTERLEAVED UV PAIRS (16 BYTES)
////VLD1.8 {D3},[x2]! @//LOAD 8 VALUES OF V
//// calculate width loop count
LSR x6, x3, #4 //// width_cnt = width / 16
////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
////LOAD VALUES OF Y 8-BIT VALUES
LD2 {v30.8b, v31.8b},[x0],#16 ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
LD2 {v28.8b, v29.8b},[x7],#16 ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
SUBS x6,x6,#1
BEQ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP
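////The loop is software-pipelined: the UV and Y loads inside the body fetch
////data for the NEXT 16-pixel column, so the final column is handled
////separately at WIDTH_LOOP_SKIP and the body never loads past the row end.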
LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
//VMOV.I8 Q1,#128
UZP1 v27.8b, v2.8b, v3.8b
UZP2 v3.8b, v2.8b, v3.8b
mov v2.d[0], v27.d[0]
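////v2 now holds the 8 U samples and v3 the 8 V samples, de-interleaved from
////the UV pairs (taking U as the first byte of each pair, as the comments in
////this file assume).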
////NEED TO SUBTRACT (U-128) AND (V-128)
////(D2-D1),(D3-D1)
uSUBL v4.8h, v2.8b, v1.8b ////(U-128)
uSUBL v6.8h, v3.8b, v1.8b ////(V-128)
////LOAD VALUES OF U&V for next row
LD1 {v2.8b, v3.8b},[x1],#16 ////LOAD THE NEXT 8 INTERLEAVED UV PAIRS
////VLD1.8 {D3},[x2]! @//LOAD 8 VALUES OF V
//PLD [x0]
prfm PLDL1KEEP,[x1]
////MULTIPLY Q2,Q3 (U-128 AND V-128) BY THE COEFFICIENTS
sMULL v5.4s, v4.4h, v0.h[3] ////(U-128)*C4 FOR B
sMULL2 v7.4s, v4.8h, v0.h[3] ////(U-128)*C4 FOR B
sMULL v20.4s, v6.4h, v0.h[0] ////(V-128)*C1 FOR R
sMULL2 v22.4s, v6.8h, v0.h[0] ////(V-128)*C1 FOR R
sMULL v12.4s, v4.4h, v0.h[1] ////(U-128)*C2 FOR G
sMLAL v12.4s, v6.4h, v0.h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
sMULL2 v14.4s, v4.8h, v0.h[1] ////(U-128)*C2 FOR G
sMLAL2 v14.4s, v6.8h, v0.h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
////NARROW RIGHT SHIFT BY 13 FOR R&B
sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
////Q4 - WEIGHT FOR B
////NARROW RIGHT SHIFT BY 13 FOR R&B
sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
////Q5 - WEIGHT FOR R
////NARROW RIGHT SHIFT BY 13 FOR G
sqshrn v12.4h, v12.4s,#13 ////D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
////Q6 - WEIGHT FOR G
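////Each 16-bit weight lane corresponds to one U/V pair, which in 4:2:0 covers
////a 2x2 block of luma samples, so v5/v7/v12 are reused for the even and odd
////luma samples of both rows below.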
UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B
UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R
UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B
UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
sqxtun v15.8b, v18.8h
sqxtun v16.8b, v16.8h
movi v17.8b, #0
sqxtun v20.8b, v20.8h
sqxtun v21.8b, v24.8h
sqxtun v22.8b, v22.8h
movi v23.8b, #0
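////Interleave the saturated bytes: first B with G and R with A(=0), then the
////halfword pairs, then the word pairs, so that v14/v20/v16/v22 each hold four
////pixels with bytes ordered B,G,R,A from the lowest address when stored.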
ZIP1 v27.8b, v14.8b, v15.8b
ZIP2 v15.8b, v14.8b, v15.8b
mov v14.d[0], v27.d[0]
ZIP1 v27.8b, v16.8b, v17.8b
ZIP2 v17.8b, v16.8b, v17.8b
mov v16.d[0], v27.d[0]
ZIP1 v27.8b, v20.8b, v21.8b
ZIP2 v21.8b, v20.8b, v21.8b
mov v20.d[0], v27.d[0]
ZIP1 v27.8b, v22.8b, v23.8b
ZIP2 v23.8b, v22.8b, v23.8b
mov v22.d[0], v27.d[0]
mov v14.d[1], v15.d[0]
mov v20.d[1], v21.d[0]
mov v16.d[1], v17.d[0]
mov v22.d[1], v23.d[0]
ZIP1 v27.8h, v14.8h, v16.8h
ZIP2 v26.8h, v14.8h, v16.8h
ZIP1 v25.8h, v20.8h, v22.8h
ZIP2 v19.8h, v20.8h, v22.8h
ZIP1 v14.4s, v27.4s, v25.4s
ZIP2 v20.4s, v27.4s, v25.4s
ZIP1 v16.4s, v26.4s, v19.4s
ZIP2 v22.4s, v26.4s, v19.4s
ST1 {v14.4s},[x2],#16
ST1 {v20.4s},[x2],#16
ST1 {v16.4s},[x2],#16
ST1 {v22.4s},[x2],#16
////THE 16 RGBA PIXELS OF ROW 1 HAVE BEEN STORED
////NOW COMPUTE Y+B, Y+R AND Y+G FOR ROW 2 USING THE SAME CHROMA WEIGHTS
UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B
UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R
UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B
UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
////LOAD VALUES OF Y 8-BIT VALUES
LD2 {v30.8b, v31.8b},[x0],#16 ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
LD2 {v28.8b, v29.8b},[x7],#16 ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
prfm PLDL1KEEP,[x0]
prfm PLDL1KEEP,[x7]
sqxtun v14.8b, v14.8h
sqxtun v15.8b, v18.8h
sqxtun v16.8b, v16.8h
movi v17.8b, #0
sqxtun v20.8b, v20.8h
sqxtun v21.8b, v24.8h
sqxtun v22.8b, v22.8h
movi v23.8b, #0
ZIP1 v27.8b, v14.8b, v15.8b
ZIP2 v15.8b, v14.8b, v15.8b
mov v14.d[0], v27.d[0]
ZIP1 v27.8b, v16.8b, v17.8b
ZIP2 v17.8b, v16.8b, v17.8b
mov v16.d[0], v27.d[0]
ZIP1 v27.8b, v20.8b, v21.8b
ZIP2 v21.8b, v20.8b, v21.8b
mov v20.d[0], v27.d[0]
ZIP1 v27.8b, v22.8b, v23.8b
ZIP2 v23.8b, v22.8b, v23.8b
mov v22.d[0], v27.d[0]
mov v14.d[1], v15.d[0]
mov v20.d[1], v21.d[0]
mov v16.d[1], v17.d[0]
mov v22.d[1], v23.d[0]
ZIP1 v27.8h, v14.8h, v16.8h
ZIP2 v26.8h, v14.8h, v16.8h
ZIP1 v25.8h, v20.8h, v22.8h
ZIP2 v19.8h, v20.8h, v22.8h
ZIP1 v14.4s, v27.4s, v25.4s
ZIP2 v20.4s, v27.4s, v25.4s
ZIP1 v16.4s, v26.4s, v19.4s
ZIP2 v22.4s, v26.4s, v19.4s
ST1 {v14.4s},[x8],#16
ST1 {v20.4s},[x8],#16
ST1 {v16.4s},[x8],#16
ST1 {v22.4s},[x8],#16
SUBS x6,x6,#1 //// width_cnt -= 1
BNE LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP
LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
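////Peeled final 16-pixel column: identical to the loop body above, except that
////no further UV or Y loads are issued.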
//VMOV.I8 Q1,#128
UZP1 v27.8b, v2.8b, v3.8b
UZP2 v3.8b, v2.8b, v3.8b
mov v2.d[0], v27.d[0]
////NEED TO SUBTRACT (U-128) AND (V-128)
////(D2-D1),(D3-D1)
uSUBL v4.8h, v2.8b, v1.8b ////(U-128)
uSUBL v6.8h, v3.8b, v1.8b ////(V-128)
////MULTIPLY Q2,Q3 (U-128 AND V-128) BY THE COEFFICIENTS
sMULL v5.4s, v4.4h, v0.h[3] ////(U-128)*C4 FOR B
sMULL2 v7.4s, v4.8h, v0.h[3] ////(U-128)*C4 FOR B
sMULL v20.4s, v6.4h, v0.h[0] ////(V-128)*C1 FOR R
sMULL2 v22.4s, v6.8h, v0.h[0] ////(V-128)*C1 FOR R
sMULL v12.4s, v4.4h, v0.h[1] ////(U-128)*C2 FOR G
sMLAL v12.4s, v6.4h, v0.h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
sMULL2 v14.4s, v4.8h, v0.h[1] ////(U-128)*C2 FOR G
sMLAL2 v14.4s, v6.8h, v0.h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
////NARROW RIGHT SHIFT BY 13 FOR R&B
sqshrn v5.4h, v5.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
sqshrn2 v5.8h, v7.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
////Q4 - WEIGHT FOR B
////NARROW RIGHT SHIFT BY 13 FOR R&B
sqshrn v7.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
sqshrn2 v7.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
////Q5 - WEIGHT FOR R
////NARROW RIGHT SHIFT BY 13 FOR G
sqshrn v12.4h, v12.4s,#13 ////D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
////Q6 - WEIGHT FOR G
UADDW v14.8h, v5.8h , v30.8b ////Q7 - HAS Y + B
UADDW v16.8h, v7.8h , v30.8b ////Q8 - HAS Y + R
UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
UADDW v20.8h, v5.8h , v31.8b ////Q10 - HAS Y + B
UADDW v22.8h, v7.8h , v31.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
sqxtun v15.8b, v18.8h
sqxtun v16.8b, v16.8h
movi v17.8b, #0
sqxtun v20.8b, v20.8h
sqxtun v21.8b, v24.8h
sqxtun v22.8b, v22.8h
movi v23.8b, #0
ZIP1 v27.8b, v14.8b, v15.8b
ZIP2 v15.8b, v14.8b, v15.8b
mov v14.d[0], v27.d[0]
ZIP1 v27.8b, v16.8b, v17.8b
ZIP2 v17.8b, v16.8b, v17.8b
mov v16.d[0], v27.d[0]
ZIP1 v27.8b, v20.8b, v21.8b
ZIP2 v21.8b, v20.8b, v21.8b
mov v20.d[0], v27.d[0]
ZIP1 v27.8b, v22.8b, v23.8b
ZIP2 v23.8b, v22.8b, v23.8b
mov v22.d[0], v27.d[0]
mov v14.d[1], v15.d[0]
mov v20.d[1], v21.d[0]
mov v16.d[1], v17.d[0]
mov v22.d[1], v23.d[0]
ZIP1 v27.8h, v14.8h, v16.8h
ZIP2 v26.8h, v14.8h, v16.8h
ZIP1 v25.8h, v20.8h, v22.8h
ZIP2 v19.8h, v20.8h, v22.8h
ZIP1 v14.4s, v27.4s, v25.4s
ZIP2 v20.4s, v27.4s, v25.4s
ZIP1 v16.4s, v26.4s, v19.4s
ZIP2 v22.4s, v26.4s, v19.4s
ST1 {v14.4s},[x2],#16
ST1 {v20.4s},[x2],#16
ST1 {v16.4s},[x2],#16
ST1 {v22.4s},[x2],#16
////THE 16 RGBA PIXELS OF ROW 1 HAVE BEEN STORED
////NOW COMPUTE Y+B, Y+R AND Y+G FOR ROW 2 USING THE SAME CHROMA WEIGHTS
UADDW v14.8h, v5.8h , v28.8b ////Q7 - HAS Y + B
UADDW v16.8h, v7.8h , v28.8b ////Q2 - HAS Y + R
UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
UADDW v20.8h, v5.8h , v29.8b ////Q10 - HAS Y + B
UADDW v22.8h, v7.8h , v29.8b ////Q11 - HAS Y + R
UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
sqxtun v14.8b, v14.8h
sqxtun v15.8b, v18.8h
sqxtun v16.8b, v16.8h
movi v17.8b, #0
sqxtun v20.8b, v20.8h
sqxtun v21.8b, v24.8h
sqxtun v22.8b, v22.8h
movi v23.8b, #0
ZIP1 v27.8b, v14.8b, v15.8b
ZIP2 v15.8b, v14.8b, v15.8b
mov v14.d[0], v27.d[0]
ZIP1 v27.8b, v16.8b, v17.8b
ZIP2 v17.8b, v16.8b, v17.8b
mov v16.d[0], v27.d[0]
ZIP1 v27.8b, v20.8b, v21.8b
ZIP2 v21.8b, v20.8b, v21.8b
mov v20.d[0], v27.d[0]
ZIP1 v27.8b, v22.8b, v23.8b
ZIP2 v23.8b, v22.8b, v23.8b
mov v22.d[0], v27.d[0]
mov v14.d[1], v15.d[0]
mov v20.d[1], v21.d[0]
mov v16.d[1], v17.d[0]
mov v22.d[1], v23.d[0]
ZIP1 v27.8h, v14.8h, v16.8h
ZIP2 v26.8h, v14.8h, v16.8h
ZIP1 v25.8h, v20.8h, v22.8h
ZIP2 v19.8h, v20.8h, v22.8h
ZIP1 v14.4s, v27.4s, v25.4s
ZIP2 v20.4s, v27.4s, v25.4s
ZIP1 v16.4s, v26.4s, v19.4s
ZIP2 v22.4s, v26.4s, v19.4s
ST1 {v14.4s},[x8],#16
ST1 {v20.4s},[x8],#16
ST1 {v16.4s},[x8],#16
ST1 {v22.4s},[x8],#16
//// Adjust the address pointers
ADD x0,x7,x10 //// luma = luma_next + offset
ADD x2,x8,x14,LSL #2 //// rgb = rgb_next + offset
ADD x7,x0,x3 //// luma_next = luma + width
ADD x8,x2,x3,LSL #2 //// rgb_next_row = rgb + width
ADD x1,x1,x11 //// adjust u pointer
//ADD x2,x2,x12 @// adjust v pointer
ADD x7,x7,x10 //// luma_next = luma + width + offset (because of register crunch)
ADD x8,x8,x14,LSL #2 //// rgb_next_row = rgb + width + offset
SUBS x5,x5,#1 //// height_cnt -= 1
BNE LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP
////POP THE REGISTERS
// LDMFD sp!,{x4-x12,PC}
ldp x19, x20,[sp],#16
ldp d8,d15,[sp],#16 // Loading d15 with { ldr d15,[sp]; add sp,sp,#8 } caused a bus error,
// so d8 is loaded alongside d15 as a dummy. d8 is not used in the function.
ldp d12,d14,[sp],#16
ret
.section .note.GNU-stack,"",%progbits