@ blob: a9a75cbe2ade3395d7bb344cd6bbedc483fc3054 [file] [log] [blame]
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@/*******************************************************************************
@* @file
@* ihevcd_fmt_conv_420sp_to_rgba8888.s
@*
@* @brief
@* contains function definitions for format conversions
@*
@* @author
@* ittiam
@*
@* @par list of functions:
@*
@*
@* @remarks
@* none
@*
@*******************************************************************************/
.equ DO1STROUNDING, 0
@ ARM
@
@ PRESERVE8
.text
.p2align 2
@/*****************************************************************************
@* *
@* Function Name : ihevcd_fmt_conv_420sp_to_rgba8888() *
@* *
@* Description : This function converts the image from YUV 4:2:0 *
@* semi-planar (interleaved UV) color space to the *
@* RGBA8888 color space. The function can be *
@* invoked at the MB level. *
@* *
@* Arguments : R0 pu1Y (luma source) *
@* R1 pu1UV (interleaved chroma source) *
@* R2 pu4RGBA (destination) *
@* R3 usWidth (in pixels) *
@* [R13 #40] usHeight *
@* [R13 #44] usStrideY *
@* [R13 #48] usStrideUV *
@* [R13 #52] usStrideRGBA (in pixels) *
@* *
@* Values Returned : None *
@* *
@* Register Usage : R0 - R14 *
@* *
@* Stack Usage : 40 Bytes *
@* *
@* Interruptibility : Interruptible *
@* *
@* Known Limitations *
@* Assumptions: Image Width: Assumed to be multiple of 16 and *
@* greater than or equal to 16 *
@* Image Height: Assumed to be even. *
@* *
@* Revision History : *
@* DD MM YYYY Author(s) Changes (Describe the changes made) *
@* 07 06 2010 Varshita Draft *
@* 07 06 2010 Naveen Kr T Completed *
@* 05 08 2013 Naveen K P Modified for HEVC *
@*****************************************************************************/
.global ihevcd_fmt_conv_420sp_to_rgba8888_a9q
.type ihevcd_fmt_conv_420sp_to_rgba8888_a9q, function
@//---------------------------------------------------------------------------
@// ihevcd_fmt_conv_420sp_to_rgba8888_a9q
@// Converts YUV 4:2:0 semi-planar (interleaved UV) data to 32-bit RGBA.
@// Register arguments: R0 = luma src, R1 = interleaved UV src,
@//                     R2 = RGBA dst, R3 = width in pixels.
@// Stack arguments (found above the 10 registers pushed below, i.e. SP+40..):
@//   [SP,#40] height, [SP,#44] luma stride, [SP,#48] UV stride,
@//   [SP,#52] RGBA stride (in PIXELS, scaled by 4 to bytes where used).
@// Assumes width is a multiple of 16 and >= 16; height is even.
@// Two luma rows are produced per height-loop iteration (chroma is shared
@// vertically in 4:2:0).
@// NOTE(review): each output pixel is stored as the byte sequence B,G,R,A
@// with the alpha byte forced to 0 (not 255) -- confirm callers expect that.
@//---------------------------------------------------------------------------
ihevcd_fmt_conv_420sp_to_rgba8888_a9q:
@// push the registers on the stack
STMFD SP!,{R4-R12,LR}
@// Register roles after the setup below:
@//R0 - Y PTR (row 1)
@//R1 - UV PTR (interleaved U,V)
@//R2 - RGBA PTR (row 1)
@//R3 - PIC WIDTH in pixels (used as width throughout, not as a pointer)
@//R5 - row-pair count (height / 2)
@//R6 - STRIDE Y (reused as the width counter inside the height loop)
@//R7 - STRIDE UV, then repurposed as Y PTR for row 2
@//R8 - RGBA PTR for row 2
@//R9 - STRIDE RGBA (in pixels)
@//TWO ROWS ARE PROCESSED AT A TIME
@//THE FOUR CONSTANTS ARE:
@//C1=0x3311,C2=0xF379,C3=0xE5F8,C4=0x4092
@// Signed Q13 fixed-point chroma coefficients (narrowed with >>13 below):
@// C1 ~ +1.596 (V->R), C2 ~ -0.391 (U->G), C3 ~ -0.813 (V->G),
@// C4 ~ +2.018 (U->B); presumably BT.601 full-range -- TODO confirm.
@PLD [R0]
@PLD [R1]
@PLD [R2]
@/* can be loaded from a defined const type */
MOVW R10,#0x3311
VMOV.16 D0[0],R10 @//C1
MOVW R10,#0xF379
VMOV.16 D0[1],R10 @//C2
MOVW R10,#0xE5F8
VMOV.16 D0[2],R10 @//C3
MOVW R10,#0x4092
VMOV.16 D0[3],R10 @//C4
@//LOAD CONSTANT 128 INTO A CORTEX REGISTER
MOV R10,#128
VDUP.8 D1,R10 @//D1 = 128 in every byte lane, for unbiasing chroma
@//D0 HAS C1-C2-C3-C4
@// load other parameters from stack (10 regs saved => args start at SP+40)
LDR R5,[sp,#40] @// height
@LDR R4,[sp,#44]
LDR R6,[sp,#44] @// luma stride
LDR R7,[sp,#48] @// UV stride
@LDR R8,[sp,#52]
LDR R9,[sp,#52] @// RGBA stride (in pixels)
@// calculate offsets, offset = stride - width
SUB R10,R6,R3 @// luma offset
SUB R11,R7,R3 @// uv offset (one UV row spans width bytes: width/2 U + width/2 V)
@, LSR #1 @// u offset (leftover from the planar separate-U/V variant)
@SUB R12,R8,R3, LSR #1 @// v offset (leftover from the planar variant)
SUB R14,R9,R3 @// rgb offset in pixels
@// calculate height loop count
MOV R5,R5, LSR #1 @// height_cnt = height / 2 (two luma rows per iteration)
@// create next row pointers for rgb and luma data
ADD R7,R0,R6 @// luma_next_row = luma + luma_stride
ADD R8,R2,R9,LSL #2 @// rgb_next_row = rgb + rgb_stride (pixels scaled x4 to bytes)
LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:
@//LOAD VALUES OF U&V AND COMPUTE THE R,G,B WEIGHT VALUES.
VLD1.8 {D2,D3},[R1]! @//LOAD 16 INTERLEAVED UV BYTES (8 U/V PAIRS)
@//VLD1.8 {D3},[R2]! @//LOAD 8 VALUES OF V (leftover from planar variant)
@// calculate width loop count
MOV R6,R3, LSR #4 @// width_cnt = width / 16
@//COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
@//LOAD VALUES OF Y 8-BIT VALUES (VLD2 deinterleaves even/odd columns)
VLD2.8 {D30,D31},[R0]! @//D30 - Y0,Y2,...,Y14 (even cols) row 1
@//D31 - Y1,Y3,...,Y15 (odd cols) row 1
VLD2.8 {D28,D29},[R7]! @//D28 - Y0,Y2,...,Y14 (even cols) row 2
@//D29 - Y1,Y3,...,Y15 (odd cols) row 2
SUBS R6,R6,#1
BEQ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP @// width == 16: only the tail block runs
LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
@VMOV.I8 Q1,#128
VUZP.8 D2,D3 @// deinterleave: D2 = 8 U values, D3 = 8 V values
@//NEED TO SUBTRACT (U-128) AND (V-128)
@//(D2-D1),(D3-D1)
VSUBL.U8 Q2,D2,D1 @//(U-128), widened to 16 bit
VSUBL.U8 Q3,D3,D1 @//(V-128), widened to 16 bit
@//LOAD VALUES OF U&V for the next iteration (early, to hide load latency)
VLD1.8 {D2,D3},[R1]! @//LOAD next 16 INTERLEAVED UV BYTES
@//VLD1.8 {D3},[R2]! @//LOAD 8 VALUES OF V (leftover from planar variant)
@PLD [R0]
PLD [R1]
@//MULTIPLY Q2,Q3 (unbiased chroma) BY THE Q13 COEFFICIENTS
VMULL.S16 Q4,D4,D0[3] @//(U-128)*C4 FOR B
VMULL.S16 Q5,D5,D0[3] @//(U-128)*C4 FOR B
VMULL.S16 Q10,D6,D0[0] @//(V-128)*C1 FOR R
VMULL.S16 Q11,D7,D0[0] @//(V-128)*C1 FOR R
VMULL.S16 Q6,D4,D0[1] @//(U-128)*C2 FOR G
VMLAL.S16 Q6,D6,D0[2] @//Q6 = (U-128)*C2 + (V-128)*C3
VMULL.S16 Q7,D5,D0[1] @//(U-128)*C2 FOR G
VMLAL.S16 Q7,D7,D0[2] @//Q7 = (U-128)*C2 + (V-128)*C3
@//SATURATING NARROW RIGHT SHIFT BY 13 (Q13 -> integer) FOR B
VQSHRN.S32 D8,Q4,#13 @//D8 = (U-128)*C4>>13 4 16-BIT VALUES
VQSHRN.S32 D9,Q5,#13 @//D9 = (U-128)*C4>>13 4 16-BIT VALUES
@//Q4 - WEIGHT FOR B
@//SATURATING NARROW RIGHT SHIFT BY 13 FOR R
VQSHRN.S32 D10,Q10,#13 @//D10 = (V-128)*C1>>13 4 16-BIT VALUES
VQSHRN.S32 D11,Q11,#13 @//D11 = (V-128)*C1>>13 4 16-BIT VALUES
@//Q5 - WEIGHT FOR R
@//SATURATING NARROW RIGHT SHIFT BY 13 FOR G
VQSHRN.S32 D12,Q6,#13 @//D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
VQSHRN.S32 D13,Q7,#13 @//D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
@//Q6 - WEIGHT FOR G
@// Add chroma weights to row-1 luma; each weight is shared by the two
@// horizontally adjacent luma samples (even cols in D30, odd cols in D31).
VADDW.U8 Q7,Q4,D30 @//Q7 - HAS Y + B (row 1, even cols)
VADDW.U8 Q8,Q5,D30 @//Q8 - HAS Y + R (row 1, even cols)
VADDW.U8 Q9,Q6,D30 @//Q9 - HAS Y + G (row 1, even cols)
VADDW.U8 Q10,Q4,D31 @//Q10 - HAS Y + B (row 1, odd cols)
VADDW.U8 Q11,Q5,D31 @//Q11 - HAS Y + R (row 1, odd cols)
VADDW.U8 Q12,Q6,D31 @//Q12 - HAS Y + G (row 1, odd cols)
@// Saturate to u8 and interleave into per-pixel byte order B,G,R,A(=0)
VQMOVUN.S16 D14,Q7 @// B, even cols
VQMOVUN.S16 D15,Q9 @// G, even cols
VQMOVUN.S16 D16,Q8 @// R, even cols
VMOV.I8 D17,#0 @// alpha bytes = 0
VZIP.8 D14,D15 @// interleave B,G
VZIP.8 D16,D17 @// interleave R,A
VZIP.16 Q7,Q8 @// B,G,R,A byte quads (even cols)
VQMOVUN.S16 D20,Q10 @// B, odd cols
VQMOVUN.S16 D21,Q12 @// G, odd cols
VQMOVUN.S16 D22,Q11 @// R, odd cols
VMOV.I8 D23,#0 @// alpha bytes = 0
VZIP.8 D20,D21
VZIP.8 D22,D23
VZIP.16 Q10,Q11 @// B,G,R,A byte quads (odd cols)
VZIP.32 Q7,Q10 @// merge even/odd pixels back into raster order
VZIP.32 Q8,Q11
VST1.32 D14,[R2]! @// store 16 RGBA pixels (64 bytes) of row 1
VST1.32 D15,[R2]!
VST1.32 D20,[R2]!
VST1.32 D21,[R2]!
VST1.32 D16,[R2]!
VST1.32 D17,[R2]!
VST1.32 D22,[R2]!
VST1.32 D23,[R2]!
@// Row 2: reuse the same chroma weights (Q4-Q6, still intact),
@// luma for row 2 comes from D28 (even cols) / D29 (odd cols).
VADDW.U8 Q7,Q4,D28 @//Q7 - HAS Y + B (row 2, even cols)
VADDW.U8 Q8,Q5,D28 @//Q8 - HAS Y + R (row 2, even cols)
VADDW.U8 Q9,Q6,D28 @//Q9 - HAS Y + G (row 2, even cols)
VADDW.U8 Q10,Q4,D29 @//Q10 - HAS Y + B (row 2, odd cols)
VADDW.U8 Q11,Q5,D29 @//Q11 - HAS Y + R (row 2, odd cols)
VADDW.U8 Q12,Q6,D29 @//Q12 - HAS Y + G (row 2, odd cols)
@//LOAD THE NEXT 16 LUMA SAMPLES OF EACH ROW (early, to hide load latency)
VLD2.8 {D30,D31},[R0]! @//D30 - even cols, D31 - odd cols, row 1
@//(next 16 luma samples of row 1)
VLD2.8 {D28,D29},[R7]! @//D28 - even cols, D29 - odd cols, row 2
@//(next 16 luma samples of row 2)
PLD [R0]
PLD [R7]
@// Same saturate + B,G,R,A interleave as above, for row 2
VQMOVUN.S16 D14,Q7 @// B, even cols
VQMOVUN.S16 D15,Q9 @// G, even cols
VQMOVUN.S16 D16,Q8 @// R, even cols
VMOV.I8 D17,#0 @// alpha bytes = 0
VZIP.8 D14,D15
VZIP.8 D16,D17
VZIP.16 Q7,Q8 @// B,G,R,A byte quads (even cols)
VQMOVUN.S16 D20,Q10 @// B, odd cols
VQMOVUN.S16 D21,Q12 @// G, odd cols
VQMOVUN.S16 D22,Q11 @// R, odd cols
VMOV.I8 D23,#0 @// alpha bytes = 0
VZIP.8 D20,D21
VZIP.8 D22,D23
VZIP.16 Q10,Q11 @// B,G,R,A byte quads (odd cols)
VZIP.32 Q7,Q10 @// merge even/odd pixels back into raster order
VZIP.32 Q8,Q11
VST1.32 D14,[R8]! @// store 16 RGBA pixels (64 bytes) of row 2
VST1.32 D15,[R8]!
VST1.32 D20,[R8]!
VST1.32 D21,[R8]!
VST1.32 D16,[R8]!
VST1.32 D17,[R8]!
VST1.32 D22,[R8]!
VST1.32 D23,[R8]!
SUBS R6,R6,#1 @// width_cnt -= 1
BNE LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP
LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
@// Tail: the last 16 pixels of the row pair. UV and luma for this block
@// were already loaded (at the top of the height loop or by the last
@// width-loop iteration), so no further loads/prefetches are issued here.
@VMOV.I8 Q1,#128
VUZP.8 D2,D3 @// deinterleave: D2 = 8 U values, D3 = 8 V values
@//NEED TO SUBTRACT (U-128) AND (V-128)
@//(D2-D1),(D3-D1)
VSUBL.U8 Q2,D2,D1 @//(U-128), widened to 16 bit
VSUBL.U8 Q3,D3,D1 @//(V-128), widened to 16 bit
@//MULTIPLY Q2,Q3 (unbiased chroma) BY THE Q13 COEFFICIENTS
VMULL.S16 Q4,D4,D0[3] @//(U-128)*C4 FOR B
VMULL.S16 Q5,D5,D0[3] @//(U-128)*C4 FOR B
VMULL.S16 Q10,D6,D0[0] @//(V-128)*C1 FOR R
VMULL.S16 Q11,D7,D0[0] @//(V-128)*C1 FOR R
VMULL.S16 Q6,D4,D0[1] @//(U-128)*C2 FOR G
VMLAL.S16 Q6,D6,D0[2] @//Q6 = (U-128)*C2 + (V-128)*C3
VMULL.S16 Q7,D5,D0[1] @//(U-128)*C2 FOR G
VMLAL.S16 Q7,D7,D0[2] @//Q7 = (U-128)*C2 + (V-128)*C3
@//SATURATING NARROW RIGHT SHIFT BY 13 (Q13 -> integer) FOR B
VQSHRN.S32 D8,Q4,#13 @//D8 = (U-128)*C4>>13 4 16-BIT VALUES
VQSHRN.S32 D9,Q5,#13 @//D9 = (U-128)*C4>>13 4 16-BIT VALUES
@//Q4 - WEIGHT FOR B
@//SATURATING NARROW RIGHT SHIFT BY 13 FOR R
VQSHRN.S32 D10,Q10,#13 @//D10 = (V-128)*C1>>13 4 16-BIT VALUES
VQSHRN.S32 D11,Q11,#13 @//D11 = (V-128)*C1>>13 4 16-BIT VALUES
@//Q5 - WEIGHT FOR R
@//SATURATING NARROW RIGHT SHIFT BY 13 FOR G
VQSHRN.S32 D12,Q6,#13 @//D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
VQSHRN.S32 D13,Q7,#13 @//D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
@//Q6 - WEIGHT FOR G
@// Add chroma weights to row-1 luma (even cols in D30, odd cols in D31)
VADDW.U8 Q7,Q4,D30 @//Q7 - HAS Y + B (row 1, even cols)
VADDW.U8 Q8,Q5,D30 @//Q8 - HAS Y + R (row 1, even cols)
VADDW.U8 Q9,Q6,D30 @//Q9 - HAS Y + G (row 1, even cols)
VADDW.U8 Q10,Q4,D31 @//Q10 - HAS Y + B (row 1, odd cols)
VADDW.U8 Q11,Q5,D31 @//Q11 - HAS Y + R (row 1, odd cols)
VADDW.U8 Q12,Q6,D31 @//Q12 - HAS Y + G (row 1, odd cols)
@// Saturate to u8 and interleave into per-pixel byte order B,G,R,A(=0)
VQMOVUN.S16 D14,Q7 @// B, even cols
VQMOVUN.S16 D15,Q9 @// G, even cols
VQMOVUN.S16 D16,Q8 @// R, even cols
VMOV.I8 D17,#0 @// alpha bytes = 0
VZIP.8 D14,D15
VZIP.8 D16,D17
VZIP.16 Q7,Q8 @// B,G,R,A byte quads (even cols)
VQMOVUN.S16 D20,Q10 @// B, odd cols
VQMOVUN.S16 D21,Q12 @// G, odd cols
VQMOVUN.S16 D22,Q11 @// R, odd cols
VMOV.I8 D23,#0 @// alpha bytes = 0
VZIP.8 D20,D21
VZIP.8 D22,D23
VZIP.16 Q10,Q11 @// B,G,R,A byte quads (odd cols)
VZIP.32 Q7,Q10 @// merge even/odd pixels back into raster order
VZIP.32 Q8,Q11
VST1.32 D14,[R2]! @// store the final 16 RGBA pixels of row 1
VST1.32 D15,[R2]!
VST1.32 D20,[R2]!
VST1.32 D21,[R2]!
VST1.32 D16,[R2]!
VST1.32 D17,[R2]!
VST1.32 D22,[R2]!
VST1.32 D23,[R2]!
@// Row 2 of the tail: reuse the same chroma weights (Q4-Q6),
@// luma for row 2 comes from D28 (even cols) / D29 (odd cols).
VADDW.U8 Q7,Q4,D28 @//Q7 - HAS Y + B (row 2, even cols)
VADDW.U8 Q8,Q5,D28 @//Q8 - HAS Y + R (row 2, even cols)
VADDW.U8 Q9,Q6,D28 @//Q9 - HAS Y + G (row 2, even cols)
VADDW.U8 Q10,Q4,D29 @//Q10 - HAS Y + B (row 2, odd cols)
VADDW.U8 Q11,Q5,D29 @//Q11 - HAS Y + R (row 2, odd cols)
VADDW.U8 Q12,Q6,D29 @//Q12 - HAS Y + G (row 2, odd cols)
VQMOVUN.S16 D14,Q7 @// B, even cols
VQMOVUN.S16 D15,Q9 @// G, even cols
VQMOVUN.S16 D16,Q8 @// R, even cols
VMOV.I8 D17,#0 @// alpha bytes = 0
VZIP.8 D14,D15
VZIP.8 D16,D17
VZIP.16 Q7,Q8 @// B,G,R,A byte quads (even cols)
VQMOVUN.S16 D20,Q10 @// B, odd cols
VQMOVUN.S16 D21,Q12 @// G, odd cols
VQMOVUN.S16 D22,Q11 @// R, odd cols
VMOV.I8 D23,#0 @// alpha bytes = 0
VZIP.8 D20,D21
VZIP.8 D22,D23
VZIP.16 Q10,Q11 @// B,G,R,A byte quads (odd cols)
VZIP.32 Q7,Q10 @// merge even/odd pixels back into raster order
VZIP.32 Q8,Q11
VST1.32 D14,[R8]! @// store the final 16 RGBA pixels of row 2
VST1.32 D15,[R8]!
VST1.32 D20,[R8]!
VST1.32 D21,[R8]!
VST1.32 D16,[R8]!
VST1.32 D17,[R8]!
VST1.32 D22,[R8]!
VST1.32 D23,[R8]!
@// Advance all pointers to the next row pair
ADD R0,R7,R10 @// luma = luma_next + offset (skip row padding)
ADD R2,R8,R14,LSL #2 @// rgb = rgb_next + offset (offset in pixels, x4 bytes)
ADD R7,R0,R3 @// luma_next = luma + width
ADD R8,R2,R3,LSL #2 @// rgb_next = rgb + width pixels
ADD R1,R1,R11 @// advance uv pointer past the row padding
@ADD R2,R2,R12 @// adjust v pointer (leftover from planar variant)
ADD R7,R7,R10 @// luma_next = luma + width + offset (because of register crunch)
ADD R8,R8,R14,LSL #2 @// rgb_next = rgb + width + offset
SUBS R5,R5,#1 @// height_cnt -= 1
BNE LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP
@//POP THE REGISTERS AND RETURN (LR was pushed, popped into PC)
LDMFD SP!,{R4-R12,PC}
.section .note.GNU-stack,"",%progbits