 @ common/arm/ihevc_itrans_recon_8x8.s - platform/external/libhevc - Git at Google

 @/*****************************************************************************
 @*
 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
 @*
 @* Licensed under the Apache License, Version 2.0 (the "License");
 @* you may not use this file except in compliance with the License.
 @* You may obtain a copy of the License at:
 @*
 @* http://www.apache.org/licenses/LICENSE-2.0
 @*
 @* Unless required by applicable law or agreed to in writing, software
 @* distributed under the License is distributed on an "AS IS" BASIS,
 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 @* See the License for the specific language governing permissions and
 @* limitations under the License.
 @*
 @*****************************************************************************/
 @/**
 @ *******************************************************************************
 @ * @file
 @ *  ihevc_itrans_recon_8x8_neon.s
 @ *
 @ * @brief
 @ *  contains function definitions for single stage  inverse transform
 @ *
 @ * @author
 @ *  anand s
 @ *
 @ * @par list of functions:
 @ *  - ihevc_itrans_recon_8x8()
 @ *
 @ * @remarks
 @ *  none
 @ *
 @ *******************************************************************************
 @*/

 @/**
 @ *******************************************************************************
 @ *
 @ * @brief
 @ *  this function performs inverse transform  and reconstruction for 8x8
 @ * input block
 @ *
 @ * @par description:
 @ *  performs inverse transform and adds the prediction  data and clips output
 @ * to 8 bit
 @ *
 @ * @param[in] pi2_src
 @ *  input 8x8 coefficients
 @ *
 @ * @param[in] pi2_tmp
 @ *  temporary 8x8 buffer for storing inverse transform
 @ *  1st stage output
 @ *
 @ * @param[in] pu1_pred
 @ *  prediction 8x8 block
 @ *
 @ * @param[out] pu1_dst
 @ *  output 8x8 block
 @ *
 @ * @param[in] src_strd
 @ *  input stride
 @ *
 @ * @param[in] pred_strd
 @ *  prediction stride
 @ *
 @ * @param[in] dst_strd
 @ *  output stride
 @ *
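 @ * @param[in] shift
 @ *  output shift (not an argument of the prototype below; the stage shifts
 @ *  are the fixed constants shift_stage1_idct and shift_stage2_idct)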
 @ *
 @ * @param[in] zero_cols
 @ *  zero columns in pi2_src
 @ *
 @ * @param[in] zero_rows
 @ *  zero rows in pi2_src
 @ *
 @ * @returns  void
 @ *
 @ * @remarks
 @ *  none
 @ *
 @ *******************************************************************************
 @ */

 @void ihevc_itrans_recon_8x8(word16 *pi2_src,
 @                            word16 *pi2_tmp,
 @                            uword8 *pu1_pred,
 @                            uword8 *pu1_dst,
 @                            word32 src_strd,
 @                            word32 pred_strd,
 @                            word32 dst_strd,
 @                            word32 zero_cols,
 @                            word32 zero_rows               )

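 @**************variables vs registers*************************
 @   r0 => *pi2_src
 @   r1 => *pi2_tmp
 @   r2 => *pu1_pred
 @   r3 => *pu1_dst
 @   src_strd   (passed on the stack)
 @   pred_strd  (passed on the stack)
 @   dst_strd   (passed on the stack)
 @   zero_cols  (passed on the stack)
 @   zero_rows  (passed on the stack)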

 @ Byte offsets of the stack-passed arguments, measured from sp after the
 @ function prologue has pushed r4-r12,lr (10 words = 40 bytes) and d8-d15
 @ (8 doublewords = 64 bytes): 40 + 64 = 104 for the first stack argument.
 .equ    src_stride_offset,     104
 .equ    pred_stride_offset,    108
 .equ    out_stride_offset,     112
 .equ    zero_cols_offset,      116
 .equ    zero_rows_offset,      120


 .text
 .align 4


 @ width_x_size_x5 and width_x_size_x2 are not referenced by the code below.
 .set width_x_size_x5 ,   40
 .set width_x_size_x2 ,   32
 @ Right-shift (with rounding and saturation, via vqrshrn) applied after the
 @ first (column) stage and after the second (row) stage respectively.
 .set shift_stage1_idct ,   7
 .set shift_stage2_idct ,   12

 .globl ihevc_itrans_recon_8x8_a9q

 .extern g_ai2_ihevc_trans_8_transpose

 @ PC-relative offset of the coefficient table. The function loads this word
 @ and adds pc to it at label ulbl1; the "- 8" compensates for pc reading as
 @ the address of the current instruction + 8 in ARM state.
 g_ai2_ihevc_trans_8_transpose_addr:
 .long g_ai2_ihevc_trans_8_transpose - ulbl1 - 8

 .type ihevc_itrans_recon_8x8_a9q, %function

 ihevc_itrans_recon_8x8_a9q:
 @// Entry point.
 @//   r0 = pi2_src, r2 = pu1_pred, r3 = pu1_dst; the strides and the
 @//   zero_cols / zero_rows values are read from the stack.
 @//   r1 (pi2_tmp) is not referenced anywhere in this routine.
 @// This section contains the prologue, the constant load and the stage-1
 @// idct of the first four columns for the case where all eight input rows
 @// are loaded. When (zero_rows & 0xff) >= 0xf0 control goes to
 @// skip_last4_rows instead.
 @//register usage.extern        - loading and until idct of columns
 @// cosine constants    -   d0
 @// sine constants      -   d1
 @// row 0 first half    -   d2      -   y0
 @// row 1 first half    -   d6      -   y1
 @// row 2 first half    -   d3      -   y2
 @// row 3 first half    -   d7      -   y3
 @// row 4 first half    -   d10     -   y4
 @// row 5 first half    -   d14     -   y5
 @// row 6 first half    -   d11     -   y6
 @// row 7 first half    -   d15     -   y7

 @// row 0 second half   -   d4      -   y0
 @// row 1 second half   -   d8      -   y1
 @// row 2 second half   -   d5      -   y2
 @// row 3 second half   -   d9      -   y3
 @// row 4 second half   -   d12     -   y4
 @// row 5 second half   -   d16     -   y5
 @// row 6 second half   -   d13     -   y6
 @// row 7 second half   -   d17     -   y7

     @// save the core registers and the NEON registers d8-d15 that are
     @// modified below; both are restored at the end of the routine
     @// step 1 : load all constants
     stmfd       sp!,{r4-r12,lr}
     vpush       {d8  -  d15}

     ldr         r8, [sp, #pred_stride_offset]    @ prediction stride
     ldr         r7, [sp, #out_stride_offset]     @ destination stride
     ldr         r6, [sp, #src_stride_offset]     @ src stride
     ldr         r12, [sp, #zero_cols_offset]     @ zero_cols
     ldr         r11, [sp, #zero_rows_offset]     @ zero_rows
     mov         r6,r6,lsl #1                @ x sizeof(word16)
     add         r9,r0,r6, lsl #1            @ r9 = pi2_src + 2 rows

     add         r10,r6,r6, lsl #1           @ 3 rows

     sub         r10,r10, #8                 @ - 4 cols * sizeof(word16)
     sub         r5,r6, #8                   @ src_strd - 4 cols * sizeof(word16)

 @   ldr         r14,=g_imp4d_cxa8_idct_q15
     ldr         r14,g_ai2_ihevc_trans_8_transpose_addr
 ulbl1:
     add         r14,r14,pc                  @ r14 = address of g_ai2_ihevc_trans_8_transpose
     vld1.16     {d0,d1},[r14]               @//d0,d1 are used for storing the constant data

     @//step 2 load all the input data
     @//step 3 operate first 4 columns at a time

     and         r11,r11,#0xff               @ only the low 8 bits of zero_rows are examined
     and         r12,r12,#0xff               @ only the low 8 bits of zero_cols are examined

     cmp         r11,#0xf0
     bge         skip_last4_rows             @ (zero_rows & 0xff) >= 0xf0 : rows 4-7 are not loaded


     @// r0 walks rows 0,1,4,5 and r9 walks rows 2,3,6,7. Each row is read as
     @// two 4-coefficient halves: the first load post-increments by 8 bytes,
     @// the second by r5 (to the next row) or r10 (three rows further).
     vld1.16     d2,[r0]!
     vld1.16     d3,[r9]!
     vld1.16     d4,[r0],r5
     vmull.s16   q10,d2,d0[0]                @// y0 * cos4(part of c0 and c1)
     vld1.16     d5,[r9],r5
     vmull.s16   q9,d3,d1[2]                 @// y2 * sin2(part of d1)
     vld1.16     d6,[r0]!
     vld1.16     d7,[r9]!
     vmull.s16   q12,d6,d0[1]                @// y1 * cos1(part of b0)
     vld1.16     d8,[r0],r10
     vmull.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
     vld1.16     d9,[r9],r10
     vmull.s16   q14,d6,d1[1]                @// y1 * sin3(part of b2)
     vld1.16     d10,[r0]!
     vmull.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)
     vld1.16     d11,[r9]!
     vmlal.s16   q12,d7,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
     vld1.16     d12,[r0],r5
     vmlsl.s16   q13,d7,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
     vld1.16     d13,[r9],r5
     vmlsl.s16   q14,d7,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
     vld1.16     d14,[r0]!
     vmlsl.s16   q15,d7,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
     vld1.16     d15,[r9]!
     vmull.s16   q11,d10,d0[0]               @// y4 * cos4(part of c0 and c1)
     vld1.16     d16,[r0],r10
     vmull.s16   q3,d3,d0[2]                 @// y2 * cos2(part of d0) (q3 = d6,d7 is freed by this time)
     vld1.16     d17,[r9],r10

     @/* the following was used when the alignment was not there (kept commented out) */
 @// vld1.16     d2,[r0]!
 @// vld1.16     d3,[r2]!
 @// vld1.16     d4,[r0]!
 @// vld1.16     d5,[r2]!
 @// vld1.16     d6,[r0]!
 @// vld1.16     d7,[r2]!
 @// vld1.16     d8,[r0],r3
 @// vld1.16     d9,[r2],r3
 @// vld1.16     d10,[r0]!
 @// vld1.16     d11,[r2]!
 @// vld1.16     d12,[r0]!
 @// vld1.16     d13,[r2]!
 @// vld1.16     d14,[r0]!
 @// vld1.16     d15,[r2]!
 @// vld1.16     d16,[r0],r3
 @// vld1.16     d17,[r2],r3


     vmlal.s16   q12,d14,d1[1]               @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
     vmlsl.s16   q13,d14,d0[1]               @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
     vmlal.s16   q14,d14,d1[3]               @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
     vmlal.s16   q15,d14,d0[3]               @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

     vmlsl.s16   q9,d11,d0[2]                @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
     vmlal.s16   q3,d11,d1[2]                @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

     @// q5 (d10,d11 = y4,y6) is overwritten here; both were consumed above
     vadd.s32    q5,q10,q11                  @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
     vsub.s32    q10,q10,q11                 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

     vmlal.s16   q12,d15,d1[3]               @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
     vmlsl.s16   q13,d15,d1[1]               @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
     vmlal.s16   q14,d15,d0[3]               @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
     vmlsl.s16   q15,d15,d0[1]               @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)

     @// q7 (d14,d15 = y5,y7) is overwritten here; both were consumed above
     vadd.s32    q7,q5,q3                    @// a0 = c0 + d0(part of r0,r7)
     vsub.s32    q5,q5,q3                    @// a3 = c0 - d0(part of r3,r4)
     vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
     vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)

     vadd.s32    q10,q7,q12                  @// a0 + b0(part of r0)
     vsub.s32    q3,q7,q12                   @// a0 - b0(part of r7)

     vadd.s32    q12,q11,q14                 @// a2 + b2(part of r2)
     vsub.s32    q11,q11,q14                 @// a2 - b2(part of r5)

     vadd.s32    q14,q9,q13                  @// a1 + b1(part of r1)
     vsub.s32    q9,q9,q13                   @// a1 - b1(part of r6)

     vadd.s32    q13,q5,q15                  @// a3 + b3(part of r3)
     vsub.s32    q15,q5,q15                  @// a3 - b3(part of r4)

     @// narrow to 16 bit with rounding and saturation; the results go back
     @// into the registers that held the first-half inputs of the same row
     vqrshrn.s32 d2,q10,#shift_stage1_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d15,q3,#shift_stage1_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d3,q12,#shift_stage1_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d14,q11,#shift_stage1_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d6,q14,#shift_stage1_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d11,q9,#shift_stage1_idct   @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d7,q13,#shift_stage1_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d10,q15,#shift_stage1_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)


     b           last4_cols                  @ continue with the second four columns


 skip_last4_rows:
 @// Reached when (zero_rows & 0xff) >= 0xf0. Only rows 0-3 are loaded and the
 @// stage-1 idct of the first four columns is computed without any y4..y7
 @// terms. The registers that would hold the second half of rows 4-7
 @// (d12,d13,d16,d17) are cleared so that last4_cols can still use them.


     vld1.16     d2,[r0]!                    @ row 0, first half
     vld1.16     d3,[r9]!                    @ row 2, first half
     vld1.16     d4,[r0],r5                  @ row 0, second half; r0 -> row 1
     vld1.16     d5,[r9],r5                  @ row 2, second half; r9 -> row 3
     vld1.16     d6,[r0]!                    @ row 1, first half
     vld1.16     d7,[r9]!                    @ row 3, first half
     vld1.16     d8,[r0],r10                 @ row 1, second half
     vld1.16     d9,[r9],r10                 @ row 3, second half


     vmov.s16    q6,#0                       @ d12,d13 (rows 4,6 second half) = 0
     vmov.s16    q8,#0                       @ d16,d17 (rows 5,7 second half) = 0


     vmull.s16   q12,d6,d0[1]                @// y1 * cos1(part of b0)
     vmull.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
     vmull.s16   q14,d6,d1[1]                @// y1 * sin3(part of b2)
     vmull.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)

     vmlal.s16   q12,d7,d0[3]                @// b0 = y1 * cos1 + y3 * cos3
     vmlsl.s16   q13,d7,d1[3]                @// b1 = y1 * cos3 - y3 * sin1
     vmlsl.s16   q14,d7,d0[1]                @// b2 = y1 * sin3 - y3 * cos1
     vmlsl.s16   q15,d7,d1[1]                @// b3 = y1 * sin1 - y3 * sin3

     vmull.s16   q9,d3,d1[2]                 @// d1 = y2 * sin2
     vmull.s16   q3,d3,d0[2]                 @// d0 = y2 * cos2 (q3 = d6,d7 is freed by this time)

     vmull.s16   q10,d2,d0[0]                @// c0 = c1 = y0 * cos4 (no y4 term in this path)


     vadd.s32    q7,q10,q3                   @// a0 = c0 + d0(part of r0,r7)
     vsub.s32    q5,q10,q3                   @// a3 = c0 - d0(part of r3,r4)
     vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
     vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)

     vadd.s32    q10,q7,q12                  @// a0 + b0(part of r0)
     vsub.s32    q3,q7,q12                   @// a0 - b0(part of r7)

     vadd.s32    q12,q11,q14                 @// a2 + b2(part of r2)
     vsub.s32    q11,q11,q14                 @// a2 - b2(part of r5)

     vadd.s32    q14,q9,q13                  @// a1 + b1(part of r1)
     vsub.s32    q9,q9,q13                   @// a1 - b1(part of r6)

     vadd.s32    q13,q5,q15                  @// a3 + b3(part of r3)
     vsub.s32    q15,q5,q15                  @// a3 - b3(part of r4)

     @// same output register assignment as the full stage-1 path
     vqrshrn.s32 d2,q10,#shift_stage1_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d15,q3,#shift_stage1_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d3,q12,#shift_stage1_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d14,q11,#shift_stage1_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d6,q14,#shift_stage1_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d11,q9,#shift_stage1_idct   @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d7,q13,#shift_stage1_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d10,q15,#shift_stage1_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)


 last4_cols:
 @// Stage-1 idct of the second four columns. Inputs are the "second half"
 @// registers: y0=d4, y1=d8, y2=d5, y3=d9, y4=d12, y5=d16, y6=d13, y7=d17.
 @// The whole computation is skipped when (zero_cols & 0xff) >= 0xf0.


     cmp         r12,#0xf0
     bge         skip_last4cols              @ columns 4-7 are not processed

     vmull.s16   q12,d8,d0[1]                @// y1 * cos1(part of b0)
     vmull.s16   q13,d8,d0[3]                @// y1 * cos3(part of b1)
     vmull.s16   q14,d8,d1[1]                @// y1 * sin3(part of b2)
     vmull.s16   q15,d8,d1[3]                @// y1 * sin1(part of b3)

     vmlal.s16   q12,d9,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
     vmlsl.s16   q13,d9,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
     vmlsl.s16   q14,d9,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
     vmlsl.s16   q15,d9,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)

     vmull.s16   q9,d5,d1[2]                 @// y2 * sin2(part of d1)
     vmull.s16   q4,d5,d0[2]                 @// y2 * cos2(part of d0) (q4 = d8,d9 is freed by this time)

     vmull.s16   q10,d4,d0[0]                @// y0 * cos4(part of c0 and c1)
     vmull.s16   q11,d12,d0[0]               @// y4 * cos4(part of c0 and c1)

     vmlal.s16   q12,d16,d1[1]               @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
     vmlsl.s16   q13,d16,d0[1]               @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
     vmlal.s16   q14,d16,d1[3]               @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
     vmlal.s16   q15,d16,d0[3]               @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

     vmlsl.s16   q9,d13,d0[2]                @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
     vmlal.s16   q4,d13,d1[2]                @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

     @// q6 (d12,d13 = y4,y6) is overwritten here; both were consumed above
     vadd.s32    q6,q10,q11                  @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
     vsub.s32    q10,q10,q11                 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

     vmlal.s16   q12,d17,d1[3]               @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
     vmlsl.s16   q13,d17,d1[1]               @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
     vmlal.s16   q14,d17,d0[3]               @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
     vmlsl.s16   q15,d17,d0[1]               @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)

     @// q8 (d16,d17 = y5,y7) is overwritten here; both were consumed above
     vadd.s32    q8,q6,q4                    @// a0 = c0 + d0(part of e0,e7)
     vsub.s32    q6,q6,q4                    @// a3 = c0 - d0(part of e3,e4)
     vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of e2,e5)
     vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of e1,e6)

     vadd.s32    q10,q8,q12                  @// a0 + b0(part of e0)
     vsub.s32    q4,q8,q12                   @// a0 - b0(part of e7)

     vadd.s32    q12,q11,q14                 @// a2 + b2(part of e2)
     vsub.s32    q11,q11,q14                 @// a2 - b2(part of e5)

     vadd.s32    q14,q9,q13                  @// a1 + b1(part of e1)
     vsub.s32    q9,q9,q13                   @// a1 - b1(part of e6)

     vadd.s32    q13,q6,q15                  @// a3 + b3(part of e3)
     vsub.s32    q15,q6,q15                  @// a3 - b3(part of e4)

     @// results replace the second-half inputs of the same row
     vqrshrn.s32 d4,q10,#shift_stage1_idct   @// e0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d17,q4,#shift_stage1_idct   @// e7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d5,q12,#shift_stage1_idct   @// e2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d16,q11,#shift_stage1_idct  @// e5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d8,q14,#shift_stage1_idct   @// e1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d13,q9,#shift_stage1_idct   @// e6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d9,q13,#shift_stage1_idct   @// e3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
     vqrshrn.s32 d12,q15,#shift_stage1_idct  @// e4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
     b           end_skip_last4cols          @ full transpose + full stage 2


 skip_last4cols:
 @// Reached when (zero_cols & 0xff) >= 0xf0. Only the stage-1 results of the
 @// first four columns (q1,q3,q5,q7) are transposed, and the stage-2 (row)
 @// idct below uses y0..y3 only: there are no y4..y7 terms, so c0 and c1 are
 @// both y0 * cos4.


     vtrn.16     q1,q3                       @//[r3,r1],[r2,r0] first quadrant transposing

     vtrn.16     q5,q7                       @//[r7,r5],[r6,r4] third quadrant transposing


     vtrn.32     d6,d7                       @//r0,r1,r2,r3 first quadrant transposing continued.....
     vtrn.32     d2,d3                       @//r0,r1,r2,r3 first quadrant transposing continued.....

     vtrn.32     d10,d11                     @//r4,r5,r6,r7 third quadrant transposing continued.....
     vtrn.32     d14,d15                     @//r4,r5,r6,r7 third quadrant transposing continued.....


     @// stage 2 for rows 0-3 : y0 = d2, y1 = d6, y2 = d3, y3 = d7
     vmull.s16   q12,d6,d0[1]                @// y1 * cos1(part of b0)
     vmull.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
     vmull.s16   q14,d6,d1[1]                @// y1 * sin3(part of b2)
     vmull.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)

     vmlal.s16   q12,d7,d0[3]                @// b0 = y1 * cos1 + y3 * cos3
     vmlsl.s16   q13,d7,d1[3]                @// b1 = y1 * cos3 - y3 * sin1
     vmlsl.s16   q14,d7,d0[1]                @// b2 = y1 * sin3 - y3 * cos1
     vmlsl.s16   q15,d7,d1[1]                @// b3 = y1 * sin1 - y3 * sin3

     vmull.s16   q10,d2,d0[0]                @// c0 = c1 = y0 * cos4
 @   vmull.s16   q11,d4,d0[0]                    @// y4 * cos4(part of c0 and c1) - not computed in this path

     vmull.s16   q9,d3,d1[2]                 @// d1 = y2 * sin2
     vmull.s16   q3,d3,d0[2]                 @// d0 = y2 * cos2 (q3 = d6,d7 is freed by this time)


     vsub.s32    q11,q10,q3                  @// a3 = c0 - d0(part of r3,r4)
     vadd.s32    q2,q10,q3                   @// a0 = c0 + d0(part of r0,r7)


     vadd.s32    q1,q2,q12                   @// a0 + b0 (output element 0)

     vsub.s32    q3,q2,q12                   @// a0 - b0 (output element 7)

     vadd.s32    q4,q11,q15                  @// a3 + b3 (output element 3)

     vsub.s32    q12,q11,q15                 @// a3 - b3 (output element 4)

     vqrshrn.s32 d5,q4,#shift_stage2_idct    @// element 3 of rows 0-3
     vqrshrn.s32 d2,q1,#shift_stage2_idct    @// element 0 of rows 0-3
     vqrshrn.s32 d9,q3,#shift_stage2_idct    @// element 7 of rows 0-3
     vqrshrn.s32 d6,q12,#shift_stage2_idct   @// element 4 of rows 0-3

     vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
     vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)


     vadd.s32    q15,q11,q14                 @// a2 + b2 (output element 2)

     vsub.s32    q12,q11,q14                 @// a2 - b2 (output element 5)

     vadd.s32    q14,q9,q13                  @// a1 + b1 (output element 1)

     vsub.s32    q11,q9,q13                  @// a1 - b1 (output element 6)
     vqrshrn.s32 d4,q15,#shift_stage2_idct   @// element 2 of rows 0-3
     vqrshrn.s32 d7,q12,#shift_stage2_idct   @// element 5 of rows 0-3
     vqrshrn.s32 d3,q14,#shift_stage2_idct   @// element 1 of rows 0-3
     vqrshrn.s32 d8,q11,#shift_stage2_idct   @// element 6 of rows 0-3


     @// stage 2 for rows 4-7 : y0 = d10, y1 = d14, y2 = d11, y3 = d15.
     @// The vtrn/vswp instructions interleaved below transpose the rows 0-3
     @// results (d2..d9) so that q1,q3,q2,q4 end up holding rows 0,1,2,3.
     vmull.s16   q12,d14,d0[1]               @// y1 * cos1(part of b0)

     vmull.s16   q13,d14,d0[3]               @// y1 * cos3(part of b1)
     vmull.s16   q14,d14,d1[1]               @// y1 * sin3(part of b2)
     vmull.s16   q15,d14,d1[3]               @// y1 * sin1(part of b3)

     vmlal.s16   q12,d15,d0[3]               @// b0 = y1 * cos1 + y3 * cos3
     vtrn.16     d2,d3
     vmlsl.s16   q13,d15,d1[3]               @// b1 = y1 * cos3 - y3 * sin1
     vtrn.16     d4,d5
     vmlsl.s16   q14,d15,d0[1]               @// b2 = y1 * sin3 - y3 * cos1
     vtrn.16     d6,d7
     vmlsl.s16   q15,d15,d1[1]               @// b3 = y1 * sin1 - y3 * sin3
     vtrn.16     d8,d9
     vmull.s16   q10,d10,d0[0]               @// c0 = c1 = y0 * cos4
     vtrn.32     d2,d4

     vtrn.32     d3,d5
     vmull.s16   q9,d11,d1[2]                @// d1 = y2 * sin2
     vtrn.32     d6,d8
     vmull.s16   q7,d11,d0[2]                @// d0 = y2 * cos2 (q7 = d14,d15 is freed by this time)
     vtrn.32     d7,d9


     add         r4,r2,r8, lsl #1            @ r4 = r2 + pred_strd * 2    => r4 points to 3rd row of pred data


     add         r5,r8,r8, lsl #1            @ r5 = pred_strd * 3


     add         r0,r3,r7, lsl #1            @ r0 points to 3rd row of dest data


     add         r10,r7,r7, lsl #1           @ r10 = dst_strd * 3


     vswp        d3,d6                       @ q1 = row 0, q3 = row 1


     vswp        d5,d8                       @ q2 = row 2, q4 = row 3


     vsub.s32    q11,q10,q7                  @// a3 = c0 - d0(part of r3,r4)
     vadd.s32    q6,q10,q7                   @// a0 = c0 + d0(part of r0,r7)


     @// q0 (the constants d0,d1) is used as scratch from here on; no further
     @// multiply by the constants follows on this path
     vadd.s32    q0,q6,q12                   @// a0 + b0 (output element 0)


     vsub.s32    q12,q6,q12                  @// a0 - b0 (output element 7)


     vadd.s32    q6,q11,q15                  @// a3 + b3 (output element 3)


     vsub.s32    q7,q11,q15                  @// a3 - b3 (output element 4)

     vqrshrn.s32 d10,q0,#shift_stage2_idct   @// element 0 of rows 4-7
     vqrshrn.s32 d17,q12,#shift_stage2_idct  @// element 7 of rows 4-7
     vqrshrn.s32 d13,q6,#shift_stage2_idct   @// element 3 of rows 4-7
     vqrshrn.s32 d14,q7,#shift_stage2_idct   @// element 4 of rows 4-7

     vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
     vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)


     vadd.s32    q0,q11,q14                  @// a2 + b2 (output element 2)


     vsub.s32    q12,q11,q14                 @// a2 - b2 (output element 5)


     vadd.s32    q14,q9,q13                  @// a1 + b1 (output element 1)


     vsub.s32    q13,q9,q13                  @// a1 - b1 (output element 6)
     vld1.8      d18,[r2],r8                 @ pred row 0; r2 -> row 1

     vqrshrn.s32 d12,q0,#shift_stage2_idct   @// element 2 of rows 4-7
     vld1.8      d20,[r2],r5                 @ pred row 1; r2 -> row 4


     vqrshrn.s32 d15,q12,#shift_stage2_idct  @// element 5 of rows 4-7
     vld1.8      d19,[r2],r8                 @ pred row 4; r2 -> row 5


     vqrshrn.s32 d11,q14,#shift_stage2_idct  @// element 1 of rows 4-7
     vld1.8      d22,[r4],r8                 @ pred row 2; r4 -> row 3


     vqrshrn.s32 d16,q13,#shift_stage2_idct  @// element 6 of rows 4-7
     vld1.8      d21,[r2],r5                 @ pred row 5


     b           pred_buff_addition
 end_skip_last4cols:
 @// Reached from last4_cols when all eight columns went through stage 1.
 @// Transposes the full 8x8 stage-1 result as four 4x4 quadrants, then runs
 @// the stage-2 (row) idct with all of y0..y7, first for rows 0-3 and then
 @// for rows 4-7, and falls through into pred_buff_addition.


 @/* now the idct of columns is done, transpose so that row idct done efficiently(step5) */
     vtrn.16     q1,q3                       @//[r3,r1],[r2,r0] first quadrant transposing
     vtrn.16     q2,q4                       @//[r3,r1],[r2,r0] second quadrant transposing
     vtrn.16     q5,q7                       @//[r7,r5],[r6,r4] third quadrant transposing
     vtrn.16     q6,q8                       @//[r7,r5],[r6,r4] fourth quadrant transposing

     vtrn.32     d6,d7                       @//r0,r1,r2,r3 first quadrant transposing continued.....
     vtrn.32     d2,d3                       @//r0,r1,r2,r3 first quadrant transposing continued.....
     vtrn.32     d4,d5                       @//r0,r1,r2,r3 second quadrant transposing continued.....
     vtrn.32     d8,d9                       @//r0,r1,r2,r3 second quadrant transposing continued.....
     vtrn.32     d10,d11                     @//r4,r5,r6,r7 third quadrant transposing continued.....
     vtrn.32     d14,d15                     @//r4,r5,r6,r7 third quadrant transposing continued.....
     vtrn.32     d12,d13                     @//r4,r5,r6,r7 fourth quadrant transposing continued.....
     vtrn.32     d16,d17                     @//r4,r5,r6,r7 fourth quadrant transposing continued.....

     @//step6 operate on first four rows and find their idct
     @//register usage.extern        - storing and idct of rows
 @// cosine constants    -   d0
 @// sine constants      -   d1
 @// element 0 first four    -   d2      -   y0
 @// element 1 first four    -   d6      -   y1
 @// element 2 first four    -   d3      -   y2
 @// element 3 first four    -   d7      -   y3
 @// element 4 first four    -   d4      -   y4
 @// element 5 first four    -   d8      -   y5
 @// element 6 first four    -   d5      -   y6
 @// element 7 first four    -   d9      -   y7
 @// element 0 second four   -   d10     -   y0
 @// element 1 second four   -   d14     -   y1
 @// element 2 second four   -   d11     -   y2
 @// element 3 second four   -   d15     -   y3
 @// element 4 second four   -   d12     -   y4
 @// element 5 second four   -   d16     -   y5
 @// element 6 second four   -   d13     -   y6
 @// element 7 second four   -   d17     -   y7

     @// map between first kernel code seq and current
 @//     d2  ->  d2
 @//     d6  ->  d6
 @//     d3  ->  d3
 @//     d7  ->  d7
 @//     d10 ->  d4
 @//     d14 ->  d8
 @//     d11 ->  d5
 @//     d15 ->  d9
 @//     q3  ->  q3
 @//     q5  ->  q2
 @//     q7  ->  q4

     vmull.s16   q12,d6,d0[1]                @// y1 * cos1(part of b0)
     vmull.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
     vmull.s16   q14,d6,d1[1]                @// y1 * sin3(part of b2)
     vmull.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)

     vmlal.s16   q12,d7,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
     vmlsl.s16   q13,d7,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
     vmlsl.s16   q14,d7,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
     vmlsl.s16   q15,d7,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)

     vmull.s16   q10,d2,d0[0]                @// y0 * cos4(part of c0 and c1)
     vmull.s16   q11,d4,d0[0]                @// y4 * cos4(part of c0 and c1)

     vmull.s16   q9,d3,d1[2]                 @// y2 * sin2(part of d1)
     vmull.s16   q3,d3,d0[2]                 @// y2 * cos2(part of d0) (q3 = d6,d7 is freed by this time)


     vmlal.s16   q12,d8,d1[1]                @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
     vmlsl.s16   q13,d8,d0[1]                @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
     vmlal.s16   q14,d8,d1[3]                @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
     vmlal.s16   q15,d8,d0[3]                @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

     vmlsl.s16   q9,d5,d0[2]                 @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
     vmlal.s16   q3,d5,d1[2]                 @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

     vadd.s32    q1,q10,q11                  @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
     vsub.s32    q10,q10,q11                 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

     vmlal.s16   q12,d9,d1[3]                @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
     vmlsl.s16   q13,d9,d1[1]                @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
     vmlal.s16   q14,d9,d0[3]                @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
     vmlsl.s16   q15,d9,d0[1]                @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)

     vsub.s32    q11,q1,q3                   @// a3 = c0 - d0(part of r3,r4)
     vadd.s32    q2,q1,q3                    @// a0 = c0 + d0(part of r0,r7)


     vadd.s32    q1,q2,q12                   @// a0 + b0 (output element 0)

     vsub.s32    q3,q2,q12                   @// a0 - b0 (output element 7)

     vadd.s32    q4,q11,q15                  @// a3 + b3 (output element 3)

     vsub.s32    q12,q11,q15                 @// a3 - b3 (output element 4)

     vqrshrn.s32 d5,q4,#shift_stage2_idct    @// element 3 of rows 0-3
     vqrshrn.s32 d2,q1,#shift_stage2_idct    @// element 0 of rows 0-3
     vqrshrn.s32 d9,q3,#shift_stage2_idct    @// element 7 of rows 0-3
     vqrshrn.s32 d6,q12,#shift_stage2_idct   @// element 4 of rows 0-3

     vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
     vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)


     vadd.s32    q15,q11,q14                 @// a2 + b2 (output element 2)

     vsub.s32    q12,q11,q14                 @// a2 - b2 (output element 5)

     vadd.s32    q14,q9,q13                  @// a1 + b1 (output element 1)

     vsub.s32    q11,q9,q13                  @// a1 - b1 (output element 6)
     vqrshrn.s32 d4,q15,#shift_stage2_idct   @// element 2 of rows 0-3
     vqrshrn.s32 d7,q12,#shift_stage2_idct   @// element 5 of rows 0-3
     vqrshrn.s32 d3,q14,#shift_stage2_idct   @// element 1 of rows 0-3
     vqrshrn.s32 d8,q11,#shift_stage2_idct   @// element 6 of rows 0-3


     @// stage 2 for rows 4-7 (inputs d10..d17, see the table above). The
     @// vtrn/vswp instructions interleaved below transpose the rows 0-3
     @// results (d2..d9) so that q1,q3,q2,q4 end up holding rows 0,1,2,3.
     vmull.s16   q12,d14,d0[1]               @// y1 * cos1(part of b0)

     vmull.s16   q13,d14,d0[3]               @// y1 * cos3(part of b1)
     vmull.s16   q14,d14,d1[1]               @// y1 * sin3(part of b2)
     vmull.s16   q15,d14,d1[3]               @// y1 * sin1(part of b3)

     vmlal.s16   q12,d15,d0[3]               @// y1 * cos1 + y3 * cos3(part of b0)
     vtrn.16     d2,d3
     vmlsl.s16   q13,d15,d1[3]               @// y1 * cos3 - y3 * sin1(part of b1)
     vtrn.16     d4,d5
     vmlsl.s16   q14,d15,d0[1]               @// y1 * sin3 - y3 * cos1(part of b2)
     vtrn.16     d6,d7
     vmlsl.s16   q15,d15,d1[1]               @// y1 * sin1 - y3 * sin3(part of b3)
     vtrn.16     d8,d9
     vmull.s16   q10,d10,d0[0]               @// y0 * cos4(part of c0 and c1)
     vtrn.32     d2,d4
     vmull.s16   q11,d12,d0[0]               @// y4 * cos4(part of c0 and c1)
     vtrn.32     d3,d5
     vmull.s16   q9,d11,d1[2]                @// y2 * sin2(part of d1)
     vtrn.32     d6,d8
     vmull.s16   q7,d11,d0[2]                @// y2 * cos2(part of d0) (q7 = d14,d15 is freed by this time)
     vtrn.32     d7,d9
     vmlal.s16   q12,d16,d1[1]               @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)

     add         r4,r2,r8, lsl #1            @ r4 = r2 + pred_strd * 2    => r4 points to 3rd row of pred data
     vmlsl.s16   q13,d16,d0[1]               @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)

     add         r5,r8,r8, lsl #1            @ r5 = pred_strd * 3
     vmlal.s16   q14,d16,d1[3]               @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)

     add         r0,r3,r7, lsl #1            @ r0 points to 3rd row of dest data
     vmlal.s16   q15,d16,d0[3]               @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

     add         r10,r7,r7, lsl #1           @ r10 = dst_strd * 3
     vmlsl.s16   q9,d13,d0[2]                @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)


     vmlal.s16   q7,d13,d1[2]                @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

     vadd.s32    q6,q10,q11                  @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
     vsub.s32    q10,q10,q11                 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

     vmlal.s16   q12,d17,d1[3]               @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
     vswp        d3,d6                       @ q1 = row 0, q3 = row 1
     vmlsl.s16   q13,d17,d1[1]               @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)

     vswp        d5,d8                       @ q2 = row 2, q4 = row 3
     vmlal.s16   q14,d17,d0[3]               @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
     vmlsl.s16   q15,d17,d0[1]               @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)

     vsub.s32    q11,q6,q7                   @// a3 = c0 - d0(part of r3,r4)
     vadd.s32    q6,q6,q7                    @// a0 = c0 + d0(part of r0,r7)


     @// q0 (the constants d0,d1) is used as scratch from here on; no further
     @// multiply by the constants follows
     vadd.s32    q0,q6,q12                   @// a0 + b0 (output element 0)


     vsub.s32    q12,q6,q12                  @// a0 - b0 (output element 7)


     vadd.s32    q6,q11,q15                  @// a3 + b3 (output element 3)


     vsub.s32    q7,q11,q15                  @// a3 - b3 (output element 4)

     vqrshrn.s32 d10,q0,#shift_stage2_idct   @// element 0 of rows 4-7
     vqrshrn.s32 d17,q12,#shift_stage2_idct  @// element 7 of rows 4-7
     vqrshrn.s32 d13,q6,#shift_stage2_idct   @// element 3 of rows 4-7
     vqrshrn.s32 d14,q7,#shift_stage2_idct   @// element 4 of rows 4-7

     vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
     vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)


     vadd.s32    q0,q11,q14                  @// a2 + b2 (output element 2)


     vsub.s32    q12,q11,q14                 @// a2 - b2 (output element 5)


     vadd.s32    q14,q9,q13                  @// a1 + b1 (output element 1)


     vsub.s32    q13,q9,q13                  @// a1 - b1 (output element 6)
     vld1.8      d18,[r2],r8                 @ pred row 0; r2 -> row 1

     vqrshrn.s32 d12,q0,#shift_stage2_idct   @// element 2 of rows 4-7
     vld1.8      d20,[r2],r5                 @ pred row 1; r2 -> row 4


     vqrshrn.s32 d15,q12,#shift_stage2_idct  @// element 5 of rows 4-7
     vld1.8      d19,[r2],r8                 @ pred row 4; r2 -> row 5


     vqrshrn.s32 d11,q14,#shift_stage2_idct  @// element 1 of rows 4-7
     vld1.8      d22,[r4],r8                 @ pred row 2; r4 -> row 3


     vqrshrn.s32 d16,q13,#shift_stage2_idct  @// element 6 of rows 4-7
     vld1.8      d21,[r2],r5                 @ pred row 5


 pred_buff_addition:
 @// Common tail of both stage-2 paths. On entry q1,q3,q2,q4 hold output rows
 @// 0,1,2,3 and d10..d17 hold elements 0..7 of rows 4-7 (not yet transposed);
 @// d18,d20,d22,d19,d21 already hold prediction rows 0,1,2,4,5.
 @// The remaining prediction rows are loaded, rows 4-7 are transposed, the
 @// prediction is added (widening u8 add), the sums are saturated to
 @// unsigned 8 bit and the eight rows are stored to pu1_dst.


     vtrn.16     d10,d11
     vld1.8      d24,[r4],r5                 @ pred row 3; r4 -> row 6

     vtrn.16     d12,d13
     vld1.8      d23,[r4],r8                 @ pred row 6; r4 -> row 7

     vaddw.u8    q1,q1,d18                   @ row 0 + pred row 0
     vld1.8      d25,[r4],r5                 @ pred row 7

     vtrn.16     d14,d15
     vaddw.u8    q2,q2,d22                   @ row 2 + pred row 2

     vtrn.16     d16,d17
     vaddw.u8    q3,q3,d20                   @ row 1 + pred row 1

     vtrn.32     d10,d12
     vaddw.u8    q4,q4,d24                   @ row 3 + pred row 3

     vtrn.32     d11,d13
     vtrn.32     d14,d16
     vtrn.32     d15,d17

     vswp        d11,d14                     @ q5 = row 4, q7 = row 5
     vswp        d13,d16                     @ q6 = row 6, q8 = row 7

 @ row values stored in the q register.

 @q1 :r0
 @q3: r1
 @q2: r2
 @q4: r3
 @q5: r4
 @q7: r5
 @q6: r6
 @q8: r7


 @/// adding the prediction buffer


     @ the prediction data was already loaded above (d18-d25)


     @adding recon with prediction; vqmovun saturates each sum to unsigned 8 bit


     vaddw.u8    q5,q5,d19                   @ row 4 + pred row 4
     vqmovun.s16 d2,q1                       @ row 0 -> u8
     vaddw.u8    q7,q7,d21                   @ row 5 + pred row 5
     vqmovun.s16 d4,q2                       @ row 2 -> u8
     vaddw.u8    q6,q6,d23                   @ row 6 + pred row 6
     vqmovun.s16 d6,q3                       @ row 1 -> u8
     vaddw.u8    q8,q8,d25                   @ row 7 + pred row 7
     vqmovun.s16 d8,q4                       @ row 3 -> u8


     @ r3 walks dst rows 0,1,4,5 and r0 walks dst rows 2,3,6,7
     @ (r7 = dst_strd, r10 = dst_strd * 3)
     vst1.8      {d2},[r3],r7                @ store row 0
     vqmovun.s16 d10,q5                      @ row 4 -> u8
     vst1.8      {d6},[r3],r10               @ store row 1
     vqmovun.s16 d14,q7                      @ row 5 -> u8
     vst1.8      {d4},[r0],r7                @ store row 2
     vqmovun.s16 d12,q6                      @ row 6 -> u8
     vst1.8      {d8},[r0],r10               @ store row 3
     vqmovun.s16 d16,q8                      @ row 7 -> u8


     vst1.8      {d10},[r3],r7               @ store row 4
     vst1.8      {d14},[r3],r10              @ store row 5
     vst1.8      {d12},[r0],r7               @ store row 6
     vst1.8      {d16},[r0],r10              @ store row 7


     @ restore the registers saved in the prologue and return (pc <- saved lr)
     vpop        {d8  -  d15}
     ldmfd       sp!,{r4-r12,pc}