@ blob: 08821f575aa7c48c9a0edc1a0caca1c71139304c [file] [log] [blame]
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@/**
@*******************************************************************************
@* @file
@* ih264_resi_trans_a9.s
@*
@* @brief
@* Contains function definitions for residual computation and forward transform
@*
@* @author
@* Ittiam
@*
@* @par List of Functions:
@* ih264_resi_trans_4x4_a9
@* ih264_resi_trans_8x8_a9
@* @remarks
@* None
@*
@*******************************************************************************
@*/
.text
.p2align 2
@*****************************************************************************
@*
@* Function Name : ih264_resi_trans_4x4_a9
@* Description : Computes the 4x4 residual (src - pred), applies the H.264
@*               4x4 forward core transform (cf4) to it, and multiplies each
@*               coefficient by the matching entry of g_scal_coff_h264_4x4
@*               (an approximate scaling). The 16 products are written to
@*               dst as 32-bit values, one row of four per dst stride.
@*
@* Arguments :
@*   R0    : pointer to src buffer  (8-bit pixels)
@*   R1    : pointer to pred buffer (8-bit pixels)
@*   R2    : pointer to dst buffer  (32-bit results)
@*   R3    : src_stride, in bytes
@*   STACK : pred_stride (bytes), then dst_stride (counted in 32-bit
@*           elements; it is multiplied by 4 before being used)
@*
@* Values Returned : NONE
@*
@* Register Usage : r4-r6 as scratch (r4-r12, lr are saved and restored),
@*                  q0-q3 and q8-q15. Only caller-saved NEON registers are
@*                  touched, so d8-d15 need no save/restore.
@* Stack Usage : 40 bytes (r4-r12, lr)
@* Cycles : Around
@* Interruptibility : Interruptible
@*
@* Known Limitations
@* \Assumptions :
@*   - Every row load fetches 8 bytes from src and from pred although only
@*     the first 4 are used, so 4 bytes past the end of each row must be
@*     readable.
@*   - Must be assembled as ARM (A32) code: the "- 8" in the literal below
@*     compensates for pc reading as "current instruction + 8".
@*
@* Revision History :
@* DD MM YYYY Author(s) Changes
@* 30 12 2009 100633 First version
@*
@*****************************************************************************
    .global ih264_resi_trans_4x4_a9
    .extern g_scal_coff_h264_4x4

@ Distance from the pc value seen at resi_trans_4x4_pc_ref to the scaling
@ table; adding pc to it yields the table address without an absolute
@ relocation in the code.
g_scal_coff_h264_4x4_addr:
    .long   g_scal_coff_h264_4x4 - resi_trans_4x4_pc_ref - 8

ih264_resi_trans_4x4_a9:
    push        {r4-r12, lr}            @ 10 words = 40 bytes
    add         r6, sp, #40             @ r6 -> first stack argument (just above the saved registers)
    ldmfd       r6, {r4-r5}             @ r4 = pred_stride, r5 = dst_stride

    @-------------------- residual: src - pred, widened to 16 bit -----------
    @ Each vld1 fills a whole D register (8 bytes); only the low 4 bytes of
    @ every row matter. The high halves end up in d1/d3/d5/d7, which are
    @ overwritten before they are ever read.
    vld1.u8     d30, [r0], r3           @ src  row 0
    vld1.u8     d31, [r1], r4           @ pred row 0
    vld1.u8     d28, [r0], r3           @ src  row 1
    vld1.u8     d29, [r1], r4           @ pred row 1
    vld1.u8     d26, [r0], r3           @ src  row 2
    vsubl.u8    q0, d30, d31            @ d0 = residual row 0
    vld1.u8     d27, [r1], r4           @ pred row 2
    vsubl.u8    q1, d28, d29            @ d2 = residual row 1
    vld1.u8     d24, [r0], r3           @ src  row 3
    vld1.u8     d25, [r1], r4           @ pred row 3
    vsubl.u8    q2, d26, d27            @ d4 = residual row 2
    lsl         r5, r5, #2              @ dst stride: 32-bit elements -> bytes
    ldr         r6, g_scal_coff_h264_4x4_addr
resi_trans_4x4_pc_ref:
    add         r6, r6, pc              @ r6 = &g_scal_coff_h264_4x4[0]
    vsubl.u8    q3, d24, d25            @ d6 = residual row 3

    @-------------------- transpose 4x4 (rows -> columns) -------------------
    @ before:  d0 = a b c d     after:  d0 = a e i m   (x4)
    @          d2 = e f g h             d2 = b f j n   (x5)
    @          d4 = i j k l             d4 = c g k o   (x6)
    @          d6 = m n o p             d6 = d h l p   (x7)
    vtrn.16     d0, d2
    vld1.s16    {q10}, [r6]!            @ scaling coefficients 0-7; r6 advances to coefficient 8
    vtrn.16     d4, d6
    vtrn.32     d0, d4
    vtrn.32     d2, d6

    @-------------------- horizontal transform ------------------------------
    vsub.s16    d5, d2, d4              @ x2 = x5 - x6
    vsub.s16    d7, d0, d6              @ x3 = x4 - x7
    vadd.s16    d3, d2, d4              @ x1 = x5 + x6
    vadd.s16    d1, d0, d6              @ x0 = x4 + x7
    vshl.s16    d31, d7, #1             @ 2 * x3
    vshl.s16    d30, d5, #1             @ 2 * x2
    vadd.s16    d0, d1, d3              @ x0 + x1
    vsub.s16    d4, d1, d3              @ x0 - x1
    vadd.s16    d2, d31, d5             @ 2 * x3 + x2
    vsub.s16    d6, d7, d30             @ x3 - 2 * x2

    @-------------------- transpose back for the vertical pass --------------
    vtrn.16     d0, d2
    vtrn.16     d4, d6
    vtrn.32     d0, d4
    vtrn.32     d2, d6

    @-------------------- vertical transform (same butterfly) ---------------
    vadd.s16    d1, d0, d6              @ x0 = x4 + x7
    vadd.s16    d3, d2, d4              @ x1 = x5 + x6
    vsub.s16    d7, d0, d6              @ x3 = x4 - x7
    vsub.s16    d5, d2, d4              @ x2 = x5 - x6

    @-------------------- final stage interleaved with scaling --------------
    @ Products are widened to 32 bit by vmull. The destinations q8, q9, q11
    @ and q12 are all caller-saved (q10 holds the coefficients); d24 has
    @ already been consumed by the time q12 is written.
    vadd.s16    d24, d3, d1             @ row 0 = x0 + x1
    vsub.s16    d28, d1, d3             @ row 2 = x0 - x1
    vshl.s16    d0, d7, #1              @ 2 * x3
    vmull.s16   q8, d24, d20            @ row 0 * coefficients 0-3
    vshl.s16    d2, d5, #1              @ 2 * x2
    vadd.s16    d26, d0, d5             @ row 1 = 2 * x3 + x2
    vmull.s16   q9, d26, d21            @ row 1 * coefficients 4-7
    vst1.s32    {q8}, [r2], r5          @ store row 0, step to next dst row
    vld1.s16    {q10}, [r6]             @ scaling coefficients 8-15
    vsub.s16    d30, d7, d2             @ row 3 = x3 - 2 * x2
    vmull.s16   q11, d28, d20           @ row 2 * coefficients 8-11
    vst1.s32    {q9}, [r2], r5          @ store row 1
    vmull.s16   q12, d30, d21           @ row 3 * coefficients 12-15
    vst1.s32    {q11}, [r2], r5         @ store row 2
    vst1.s32    {q12}, [r2]             @ store row 3

    pop         {r4-r12, pc}            @ restore registers and return
@*****************************************************************************
@*
@* Function Name : ih264_resi_trans_8x8_a9
@* Description : Computes the 8x8 residual (src - pred), applies the H.264
@*               8x8 forward transform (cf8) to it, and multiplies each
@*               coefficient by an entry of g_scal_coff_h264_8x8 (an
@*               approximate normalization). The 64 products are written to
@*               dst as 32-bit values, one row of eight per dst stride.
@*
@* Arguments :
@*   R0    : pointer to src buffer  (8-bit pixels)
@*   R1    : pointer to pred buffer (8-bit pixels)
@*   R2    : pointer to dst buffer  (32-bit results)
@*   R3    : src_stride, in bytes
@*   STACK : pred_stride (bytes), then dst_stride (counted in 32-bit
@*           elements; it is multiplied by 4 before being used)
@*
@* Values Returned : NONE
@*
@* Register Usage : r4-r6 as scratch (r4-r12, lr are saved and restored),
@*                  q0-q15. q4-q7 (d8-d15) are callee-saved under the AAPCS
@*                  and are therefore saved on entry and restored on exit.
@* Stack Usage : 104 bytes (40 for r4-r12, lr + 64 for d8-d15)
@* Cycles : Around
@* Interruptibility : Interruptible
@*
@* Known Limitations
@* \Assumptions :
@*   - Must be assembled as ARM (A32) code: the "- 8" in the literal below
@*     compensates for pc reading as "current instruction + 8".
@*   - Only 16 scaling values (four per row, four rows) are read from
@*     g_scal_coff_h264_8x8; they are reused for both halves of a row and
@*     for rows n and n + 4.
@*
@* Revision History :
@* DD MM YYYY Author(s) Changes
@* 30 12 2009 100633 First version
@*
@*****************************************************************************
    .global ih264_resi_trans_8x8_a9
    .extern g_scal_coff_h264_8x8

@ Distance from the pc value seen at resi_trans_8x8_pc_ref to the scaling
@ table; adding pc to it yields the table address without an absolute
@ relocation in the code.
g_scal_coff_h264_8x8_addr:
    .long   g_scal_coff_h264_8x8 - resi_trans_8x8_pc_ref - 8

ih264_resi_trans_8x8_a9:
    push        {r4-r12, lr}            @ 10 words = 40 bytes
    vpush       {d8-d15}                @ callee-saved NEON registers, 64 bytes
    add         r6, sp, #104            @ r6 -> first stack argument (40 + 64 bytes above sp)
    ldmfd       r6, {r4-r5}             @ r4 = pred_stride, r5 = dst_stride

    @-------------------- residual: src - pred, widened to 16 bit -----------
    @ One D register holds a full row of 8 pixels; row n ends up in qn.
    vld1.u8     d30, [r0], r3           @ src  row 0
    vld1.u8     d31, [r1], r4           @ pred row 0
    vld1.u8     d28, [r0], r3           @ src  row 1
    vld1.u8     d29, [r1], r4           @ pred row 1
    vsubl.u8    q0, d30, d31            @ residual row 0
    vld1.u8     d26, [r0], r3           @ src  row 2
    vld1.u8     d27, [r1], r4           @ pred row 2
    vsubl.u8    q1, d28, d29            @ residual row 1
    vld1.u8     d24, [r0], r3           @ src  row 3
    vld1.u8     d25, [r1], r4           @ pred row 3
    vsubl.u8    q2, d26, d27            @ residual row 2
    vld1.u8     d22, [r0], r3           @ src  row 4
    vld1.u8     d23, [r1], r4           @ pred row 4
    vsubl.u8    q3, d24, d25            @ residual row 3
    vld1.u8     d20, [r0], r3           @ src  row 5
    vld1.u8     d21, [r1], r4           @ pred row 5
    vsubl.u8    q4, d22, d23            @ residual row 4
    vld1.u8     d18, [r0], r3           @ src  row 6
    vld1.u8     d19, [r1], r4           @ pred row 6
    vsubl.u8    q5, d20, d21            @ residual row 5
    vld1.u8     d16, [r0], r3           @ src  row 7
    vld1.u8     d17, [r1], r4           @ pred row 7
    vsubl.u8    q6, d18, d19            @ residual row 6
    lsl         r5, r5, #2              @ dst stride: 32-bit elements -> bytes
    vsubl.u8    q7, d16, d17            @ residual row 7

    @-------------------- transpose 8x8 so that qn holds column n -----------
    @ 2x2 blocks of 16-bit elements
    vtrn.16     q0, q1
    vtrn.16     q2, q3
    vtrn.16     q4, q5
    vtrn.16     q6, q7
    @ 2x2 blocks of 32-bit pairs (4x4 blocks done)
    vtrn.32     q0, q2
    vtrn.32     q1, q3
    vtrn.32     q4, q6
    vtrn.32     q5, q7
    @ exchange the off-diagonal 4x4 blocks
    vswp        d1, d8
    vswp        d7, d14
    vswp        d3, d10
    vswp        d5, d12

    @-------------------- horizontal transform ------------------------------
    @ r0..r7 below denote the transform inputs held in q0..q7.
    vadd.s16    q8, q0, q7              @ a0 = r0 + r7
    vadd.s16    q9, q1, q6              @ a1 = r1 + r6
    vadd.s16    q10, q2, q5             @ a2 = r2 + r5
    vadd.s16    q11, q3, q4             @ a3 = r3 + r4
    vsub.s16    q12, q0, q7             @ b0 = r0 - r7
    vsub.s16    q13, q1, q6             @ b1 = r1 - r6
    vsub.s16    q15, q3, q4             @ b3 = r3 - r4
    vsub.s16    q14, q2, q5             @ b2 = r2 - r5
    vadd.s16    q1, q8, q11             @ a4 = a0 + a3
    vadd.s16    q3, q9, q10             @ a5 = a1 + a2
    vsub.s16    q7, q9, q10             @ a7 = a1 - a2
    vsub.s16    q5, q8, q11             @ a6 = a0 - a3
    ldr         r6, g_scal_coff_h264_8x8_addr
resi_trans_8x8_pc_ref:
    add         r6, r6, pc              @ r6 = &g_scal_coff_h264_8x8[0]
    vadd.s16    q0, q1, q3              @ res[0] = a4 + a5
    vshr.s16    q8, q7, #1              @ a7 >> 1
    vsub.s16    q4, q1, q3              @ res[4] = a4 - a5
    vadd.s16    q2, q5, q8              @ res[2] = a6 + (a7 >> 1)
    vshr.s16    q9, q5, #1              @ a6 >> 1
    vsub.s16    q6, q9, q7              @ res[6] = (a6 >> 1) - a7
    @ q0, q2, q4, q6 now hold the even outputs; q1, q3, q5, q7 are free for
    @ the odd ones and q8-q11 are scratch.
    vshr.s16    q1, q12, #1             @ b0 >> 1
    vshr.s16    q3, q13, #1             @ b1 >> 1
    vshr.s16    q5, q14, #1             @ b2 >> 1
    vshr.s16    q7, q15, #1             @ b3 >> 1
    vadd.s16    q8, q1, q12             @ (b0 >> 1) + b0
    vadd.s16    q9, q3, q13             @ (b1 >> 1) + b1
    vadd.s16    q10, q5, q14            @ (b2 >> 1) + b2
    vadd.s16    q11, q7, q15            @ (b3 >> 1) + b3
    vadd.s16    q1, q14, q8             @ b2 + ((b0 >> 1) + b0)
    vsub.s16    q5, q15, q9             @ b3 - ((b1 >> 1) + b1)
    vadd.s16    q3, q15, q10            @ b3 + ((b2 >> 1) + b2)
    vsub.s16    q7, q11, q14            @ -b2 + ((b3 >> 1) + b3)
    vadd.s16    q8, q13, q1             @ b4 = b1 + b2 + ((b0 >> 1) + b0)
    vsub.s16    q9, q12, q3             @ b5 = b0 - b3 - ((b2 >> 1) + b2)
    vadd.s16    q10, q12, q5            @ b6 = b0 + b3 - ((b1 >> 1) + b1)
    vadd.s16    q11, q13, q7            @ b7 = b1 - b2 + ((b3 >> 1) + b3)
    vshr.s16    q15, q8, #2             @ b4 >> 2
    vshr.s16    q14, q9, #2             @ b5 >> 2
    vshr.s16    q13, q10, #2            @ b6 >> 2
    vshr.s16    q12, q11, #2            @ b7 >> 2
    vadd.s16    q3, q9, q13             @ res[3] = b5 + (b6 >> 2)
    vsub.s16    q5, q10, q14            @ res[5] = b6 - (b5 >> 2)
    vadd.s16    q1, q8, q12             @ res[1] = b4 + (b7 >> 2)
    vsub.s16    q7, q15, q11            @ res[7] = (b4 >> 2) - b7

    @-------------------- transpose back for the vertical pass --------------
    vtrn.16     q0, q1
    vtrn.16     q2, q3
    vtrn.16     q4, q5
    vtrn.16     q6, q7
    vtrn.32     q0, q2
    vtrn.32     q1, q3
    vtrn.32     q4, q6
    vtrn.32     q5, q7
    vswp        d1, d8
    vswp        d3, d10
    vswp        d5, d12
    vswp        d7, d14

    @-------------------- vertical transform (same arithmetic) --------------
    vadd.s16    q8, q0, q7              @ a0 = r0 + r7
    vadd.s16    q9, q1, q6              @ a1 = r1 + r6
    vadd.s16    q10, q2, q5             @ a2 = r2 + r5
    vadd.s16    q11, q3, q4             @ a3 = r3 + r4
    vsub.s16    q12, q0, q7             @ b0 = r0 - r7
    vsub.s16    q13, q1, q6             @ b1 = r1 - r6
    vsub.s16    q14, q2, q5             @ b2 = r2 - r5
    vsub.s16    q15, q3, q4             @ b3 = r3 - r4
    vadd.s16    q1, q8, q11             @ a4 = a0 + a3
    vadd.s16    q3, q9, q10             @ a5 = a1 + a2
    vsub.s16    q5, q8, q11             @ a6 = a0 - a3
    vsub.s16    q7, q9, q10             @ a7 = a1 - a2
    vadd.s16    q0, q1, q3              @ res[0] = a4 + a5
    vshr.s16    q8, q7, #1              @ a7 >> 1
    vadd.s16    q2, q5, q8              @ res[2] = a6 + (a7 >> 1)
    vsub.s16    q4, q1, q3              @ res[4] = a4 - a5
    vshr.s16    q9, q5, #1              @ a6 >> 1
    vsub.s16    q6, q9, q7              @ res[6] = (a6 >> 1) - a7
    vshr.s16    q1, q12, #1             @ b0 >> 1
    vshr.s16    q3, q13, #1             @ b1 >> 1
    vshr.s16    q5, q14, #1             @ b2 >> 1
    vshr.s16    q7, q15, #1             @ b3 >> 1
    vadd.s16    q8, q1, q12             @ (b0 >> 1) + b0
    vadd.s16    q9, q3, q13             @ (b1 >> 1) + b1
    vadd.s16    q10, q5, q14            @ (b2 >> 1) + b2
    vadd.s16    q11, q7, q15            @ (b3 >> 1) + b3
    vadd.s16    q1, q14, q8             @ b2 + ((b0 >> 1) + b0)
    vadd.s16    q3, q15, q10            @ b3 + ((b2 >> 1) + b2)
    vsub.s16    q5, q15, q9             @ b3 - ((b1 >> 1) + b1)
    vsub.s16    q7, q11, q14            @ -b2 + ((b3 >> 1) + b3)
    vadd.s16    q8, q13, q1             @ b4 = b1 + b2 + ((b0 >> 1) + b0)
    vsub.s16    q9, q12, q3             @ b5 = b0 - b3 - ((b2 >> 1) + b2)
    vadd.s16    q10, q12, q5            @ b6 = b0 + b3 - ((b1 >> 1) + b1)
    vadd.s16    q11, q13, q7            @ b7 = b1 - b2 + ((b3 >> 1) + b3)
    vshr.s16    q15, q8, #2             @ b4 >> 2
    vshr.s16    q14, q9, #2             @ b5 >> 2
    vshr.s16    q13, q10, #2            @ b6 >> 2
    vshr.s16    q12, q11, #2            @ b7 >> 2
    vsub.s16    q5, q10, q14            @ res[5] = b6 - (b5 >> 2)
    vsub.s16    q7, q15, q11            @ res[7] = (b4 >> 2) - b7
    vadd.s16    q3, q9, q13             @ res[3] = b5 + (b6 >> 2)
    vadd.s16    q1, q8, q12             @ res[1] = b4 + (b7 >> 2)

    @-------------------- scaling and store ---------------------------------
    @ q0..q7 hold output rows 0..7. Four scale values are used per row
    @ (applied to both halves); rows n and n + 4 share the same four:
    @   d28 -> rows 0 and 4, d29 -> rows 1 and 5,
    @   d30 -> rows 2 and 6, d31 -> rows 3 and 7.
    @ vmull widens each product to 32 bit, so a row occupies two Q registers.
    vld1.s16    {q14-q15}, [r6]         @ 16 scale values -> d28..d31

    vmull.s16   q8, d0, d28             @ row 0, elements 0-3
    vmull.s16   q9, d28, d1             @ row 0, elements 4-7
    vmull.s16   q10, d2, d29            @ row 1, elements 0-3
    vmull.s16   q11, d29, d3            @ row 1, elements 4-7
    vmull.s16   q12, d4, d30            @ row 2, elements 0-3
    vst1.s32    {q8, q9}, [r2], r5      @ store row 0
    vmull.s16   q13, d30, d5            @ row 2, elements 4-7
    vmull.s16   q8, d6, d31             @ row 3, elements 0-3
    vst1.s32    {q10, q11}, [r2], r5    @ store row 1
    vmull.s16   q9, d31, d7             @ row 3, elements 4-7
    vst1.s32    {q12, q13}, [r2], r5    @ store row 2
    vmull.s16   q10, d8, d28            @ row 4, elements 0-3
    vmull.s16   q11, d28, d9            @ row 4, elements 4-7
    vmull.s16   q12, d10, d29           @ row 5, elements 0-3
    vst1.s32    {q8, q9}, [r2], r5      @ store row 3
    vmull.s16   q13, d29, d11           @ row 5, elements 4-7
    vmull.s16   q8, d12, d30            @ row 6, elements 0-3
    vst1.s32    {q10, q11}, [r2], r5    @ store row 4
    vmull.s16   q9, d30, d13            @ row 6, elements 4-7
    vmull.s16   q10, d14, d31           @ row 7, elements 0-3
    vst1.s32    {q12, q13}, [r2], r5    @ store row 5
    vmull.s16   q11, d31, d15           @ row 7, elements 4-7
    vst1.s32    {q8, q9}, [r2], r5      @ store row 6
    vst1.s32    {q10, q11}, [r2]        @ store row 7 (last row: no pointer update needed)

    vpop        {d8-d15}                @ restore callee-saved NEON registers
    pop         {r4-r12, pc}            @ restore registers and return