| @/****************************************************************************** |
| @ * |
| @ * Copyright (C) 2015 The Android Open Source Project |
| @ * |
| @ * Licensed under the Apache License, Version 2.0 (the "License"); |
| @ * you may not use this file except in compliance with the License. |
| @ * You may obtain a copy of the License at: |
| @ * |
| @ * http://www.apache.org/licenses/LICENSE-2.0 |
| @ * |
| @ * Unless required by applicable law or agreed to in writing, software |
| @ * distributed under the License is distributed on an "AS IS" BASIS, |
| @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| @ * See the License for the specific language governing permissions and |
| @ * limitations under the License. |
| @ * |
| @ ***************************************************************************** |
| @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| @*/ |
| @/** |
| @******************************************************************************* |
| @* @file |
| @* ih264_resi_trans_a9.s |
| @* |
| @* @brief |
| @* Contains function definitions for residual and forward trans |
| @* |
| @* @author |
| @* Ittiam |
| @* |
| @* @par List of Functions: |
| @* ih264_resi_trans_4x4_a9 |
| @* ih264_resi_trans_8x8_a9 |
| @* @remarks |
| @* None |
| @* |
| @******************************************************************************* |
| |
| |
| .text |
| .p2align 2 |
| @***************************************************************************** |
| @* |
| @* Function Name : ih264_resi_trans_4x4_a9 |
| @* Description : This function does cf4 of H264 followed by an approximate scaling |
| @* |
| @* Arguments : |
| @ R0 :pointer to src buffer |
| @ R1 :pointer to pred buffer |
| @ R2 :pointer to dst buffer |
| @ R3 :src_stride |
| @ STACk :pred_stride,dst_stride |
| |
| @* Values Returned : NONE |
| @* |
| @* Register Usage : |
| @* Stack Usage : |
| @* Cycles : Around |
| @* Interruptibility : Interruptible |
| @* |
| @* Known Limitations |
| @* \Assumptions : |
| @* |
| @* Revision History : |
| @* DD MM YYYY Author(s) Changes |
| @* 30 12 2009 100633 First version |
| @* |
| @***************************************************************************** |
| |
| |
| .global ih264_resi_trans_4x4_a9 |
| .extern g_scal_coff_h264_4x4 |
| @ Literal holding the offset of g_scal_coff_h264_4x4 relative to the PC value |
| @ read at 4x4lbl. The "- 8" matches a PC that reads as instruction address + 8 |
| @ (ARM state -- NOTE(review): this would not hold if assembled as Thumb). |
| g_scal_coff_h264_4x4_addr: |
| .long g_scal_coff_h264_4x4 - 4x4lbl - 8 |
| |
| @----------------------------------------------------------------------------- |
| @ ih264_resi_trans_4x4_a9 |
| @ |
| @ Computes the 4x4 residual (src - pred), applies the 4x4 forward core |
| @ transform in both directions and multiplies every coefficient by the |
| @ matching entry of g_scal_coff_h264_4x4. Results are stored as 32-bit values. |
| @ |
| @ In: |
| @ r0 : pointer to src buffer |
| @ r1 : pointer to pred buffer |
| @ r2 : pointer to dst buffer |
| @ r3 : src_stride |
| @ [sp] : pred_stride |
| @ [sp, #4] : dst_stride |
| @ Out: |
| @ four rows of four 32-bit values written through r2 |
| @ Registers: |
| @ core : r4-r6 are used; r4-r12 and lr are saved/restored on the stack |
| @ NEON : q0-q3 and q8-q15 only. These are all caller-saved, so the |
| @ callee-saved d8-d15 (q4-q7) are deliberately left untouched. |
| @----------------------------------------------------------------------------- |
| ih264_resi_trans_4x4_a9: |
| |
| push {r4-r12, lr} @save the core registers (10 words = 40 bytes) |
| |
| mov r6, sp |
| add r6, r6, #40 @step over the 40 bytes just pushed to reach the stack arguments |
| ldmfd r6, {r4-r5} @load the two stack arguments |
| @R4 pred_stride |
| @R5 dst_stride |
| |
| @--------------------argument loading done------------------------ |
| |
| @residual = src - pred, one row per Q register |
| @each vld1 fills a whole D register (8 bytes); only the low four lanes |
| @(a b c d) are consumed, the upper four lanes are never read afterwards |
| vld1.u8 d30, [r0], r3 @row 1 of src, r0 += src_stride |
| vld1.u8 d31, [r1], r4 @row 1 of pred, r1 += pred_stride |
| |
| vld1.u8 d28, [r0], r3 @row 2 of src |
| vld1.u8 d29, [r1], r4 @row 2 of pred |
| |
| vld1.u8 d26, [r0], r3 @row 3 of src |
| vsubl.u8 q0, d30, d31 @q0 = src - pred, row 1 (widened to 16 bit) |
| |
| vld1.u8 d27, [r1], r4 @row 3 of pred |
| vsubl.u8 q1, d28, d29 @q1 = src - pred, row 2 |
| |
| vld1.u8 d24, [r0], r3 @row 4 of src |
| |
| vld1.u8 d25, [r1], r4 @row 4 of pred |
| vsubl.u8 q2, d26, d27 @q2 = src - pred, row 3 |
| |
| lsl r5, r5, #2 @dst_stride * 4, because 32-bit values are stored |
| ldr r6, g_scal_coff_h264_4x4_addr |
| 4x4lbl: |
| add r6, r6, pc @r6 = address of g_scal_coff_h264_4x4 |
| |
| vsubl.u8 q3, d24, d25 @q3 = src - pred, row 4 |
| |
| @after this |
| @D0 -> row 1 |
| @D2 -> row 2 |
| @D4 -> row 3 |
| @D6 -> row 4 |
| |
| @transpose the matrix so that we can do the horizontal transform first |
| @#1 #2 #3 #4 |
| @a b c d ---- D0 |
| @e f g h -----D2 |
| @i j k l -----D4 |
| @m n o p -----D6 |
| @transpose the inner 2x2 blocks |
| vtrn.16 d0, d2 |
| vld1.s16 {q10}, [r6]! @load scaling values 0-7 and advance r6 past them |
| vtrn.16 d4, d6 |
| @a e c g |
| @b f d h |
| @i m k o |
| @j n l p |
| vtrn.32 d0, d4 |
| vtrn.32 d2, d6 |
| @a e i m #1 -- D0 --- x4 |
| @b f j n #2 -- D2 --- x5 |
| @c g k o #3 -- D4 ----x6 |
| @d h l p #4 -- D6 ----x7 |
| |
| @horizontal transform |
| |
| vsub.s16 d5, d2, d4 @x2 = x5-x6 |
| vsub.s16 d7, d0, d6 @x3 = x4-x7; |
| |
| vadd.s16 d3, d2, d4 @x1 = x5+x6 |
| vadd.s16 d1, d0, d6 @x0 = x4+x7 |
| |
| |
| vshl.s16 d31, d7, #1 @x3 << 1 |
| vshl.s16 d30, d5, #1 @x2 << 1 |
| |
| vadd.s16 d0, d1, d3 @x0 + x1; |
| vsub.s16 d4, d1, d3 @x0 - x1; |
| |
| vadd.s16 d2, d31, d5 @U_SHIFT(x3,1,shft) + x2; |
| vsub.s16 d6, d7, d30 @x3 - U_SHIFT(x2,1,shft); |
| |
| @transpose again so that the same lane-wise code performs the vertical transform |
| vtrn.16 d0, d2 |
| vtrn.16 d4, d6 |
| |
| vtrn.32 d0, d4 |
| vtrn.32 d2, d6 |
| |
| @vertical transform, same butterfly as the horizontal pass |
| |
| vadd.s16 d1, d0, d6 @x0 = x4+x7 |
| vadd.s16 d3, d2, d4 @x1 = x5+x6 |
| vsub.s16 d7, d0, d6 @x3 = x4-x7; |
| vsub.s16 d5, d2, d4 @x2 = x5-x6 |
| |
| |
| @second butterfly stage interleaved with the scaling. |
| @vmull.s16 widens each 16x16 product to 32 bit. The products go to |
| @q8, q9, q11 and q12 (caller-saved) rather than q4-q7 (callee-saved d8-d15). |
| |
| vadd.s16 d24, d3, d1 @x4 = x0 + x1 |
| vsub.s16 d28, d1, d3 @x6 = x0 - x1 |
| |
| vshl.s16 d0, d7, #1 @ U_SHIFT(x3,1,shft) |
| vmull.s16 q8, d24, d20 @x4*s0 (scaling values 0-3) |
| |
| vshl.s16 d2, d5, #1 @ U_SHIFT(x2,1,shft) |
| |
| vadd.s16 d26, d0, d5 @x5 = U_SHIFT(x3,1,shft) + x2 |
| vmull.s16 q9, d26, d21 @x5*s1 (scaling values 4-7) |
| |
| vst1.s32 {q8}, [r2], r5 @store output row 1, r2 += scaled dst stride |
| |
| vld1.s16 {q10}, [r6] @load scaling values 8-15 |
| |
| vsub.s16 d30, d7, d2 @x7 = x3 - U_SHIFT(x2,1,shft) |
| |
| vmull.s16 q11, d28, d20 @x6*s2 (scaling values 8-11) |
| vst1.s32 {q9}, [r2], r5 @store output row 2 |
| |
| vmull.s16 q12, d30, d21 @x7*s3 (scaling values 12-15); d24/d25 are dead by now |
| |
| |
| vst1.s32 {q11}, [r2], r5 @store output row 3 |
| vst1.s32 {q12}, [r2] @store output row 4 |
| |
| pop {r4-r12, pc} @restore the core registers and return |
| |
| |
| |
| |
| @***************************************************************************** |
| @* Function Name : ih264_resi_trans_8x8_a9 |
| @* Description : This function does cf8 followed by an approximate normalization of H264 |
| @* |
| @* Arguments : |
| @* R0 :pointer to src buffer |
| @ R1 :pointer to pred buffer |
| @ R2 :pointer to dst buffer |
| @ R3 :src_stride |
| @ STACK :pred_stride,dst_stride |
| @* |
| @* |
| @* Values Returned : NONE |
| @* |
| @* Register Usage : |
| @* Stack Usage : |
| @* Cycles : Around |
| @* Interruptibility : Interruptible |
| @* |
| @* Known Limitations |
| @* \Assumptions : |
| @* |
| @* Revision History : |
| @* DD MM YYYY Author(s) Changes |
| @* 30 12 2009 100633 First version |
| @* |
| @***************************************************************************** |
| |
| |
| .global ih264_resi_trans_8x8_a9 |
| .extern g_scal_coff_h264_8x8 |
| @ Literal holding the offset of g_scal_coff_h264_8x8 relative to the PC value |
| @ read at 8x8lbl. The "- 8" matches a PC that reads as instruction address + 8 |
| @ (ARM state -- NOTE(review): this would not hold if assembled as Thumb). |
| g_scal_coff_h264_8x8_addr: |
| .long g_scal_coff_h264_8x8 - 8x8lbl - 8 |
| |
| |
| @----------------------------------------------------------------------------- |
| @ ih264_resi_trans_8x8_a9 |
| @ |
| @ Computes the 8x8 residual (src - pred), applies the 8x8 forward transform |
| @ in both directions and multiplies every coefficient by an entry of |
| @ g_scal_coff_h264_8x8. Results are stored as 32-bit values. |
| @ |
| @ In: |
| @ r0 : pointer to src buffer |
| @ r1 : pointer to pred buffer |
| @ r2 : pointer to dst buffer |
| @ r3 : src_stride |
| @ [sp] : pred_stride |
| @ [sp, #4] : dst_stride |
| @ Out: |
| @ eight rows of eight 32-bit values written through r2 |
| @ Registers: |
| @ core : r4-r6 are used; r4-r12 and lr are saved/restored on the stack |
| @ NEON : q0-q15 are all used. d8-d15 (q4-q7) are callee-saved, so they |
| @ are saved on entry and restored before returning. |
| @ Stack: |
| @ 40 bytes of core registers + 64 bytes of NEON registers |
| @----------------------------------------------------------------------------- |
| ih264_resi_trans_8x8_a9: |
| |
| push {r4-r12, lr} @save the core registers (10 words = 40 bytes) |
| vpush {d8-d15} @save the callee-saved NEON registers (64 bytes) |
| |
| mov r6, sp |
| add r6, r6, #104 @step over 40 + 64 saved bytes to reach the stack arguments |
| ldmfd r6, {r4-r5} @load the two stack arguments |
| @R4 pred_stride |
| @R5 dst_stride |
| |
| @--------------------argument loading done------------------------ |
| |
| @residual = src - pred, one row of 8 pixels per Q register |
| vld1.u8 d30, [r0], r3 @row 1 of src (8 pixels), r0 += src_stride |
| vld1.u8 d31, [r1], r4 @row 1 of pred (8 pixels), r1 += pred_stride |
| |
| vld1.u8 d28, [r0], r3 @src row 2 |
| vld1.u8 d29, [r1], r4 @pred row 2 |
| vsubl.u8 q0, d30, d31 @src-pred row 1 (widened to 16 bit) |
| |
| vld1.u8 d26, [r0], r3 @src row 3 |
| vld1.u8 d27, [r1], r4 @pred row 3 |
| vsubl.u8 q1, d28, d29 @src-pred row 2 |
| |
| vld1.u8 d24, [r0], r3 @src row 4 |
| vld1.u8 d25, [r1], r4 @pred row 4 |
| vsubl.u8 q2, d26, d27 @src-pred row 3 |
| |
| vld1.u8 d22, [r0], r3 @src row 5 |
| vld1.u8 d23, [r1], r4 @pred row 5 |
| vsubl.u8 q3, d24, d25 @src-pred row 4 |
| |
| vld1.u8 d20, [r0], r3 @src row 6 |
| vld1.u8 d21, [r1], r4 @pred row 6 |
| vsubl.u8 q4, d22, d23 @src-pred row 5 |
| |
| vld1.u8 d18, [r0], r3 @src row 7 |
| vld1.u8 d19, [r1], r4 @pred row 7 |
| vsubl.u8 q5, d20, d21 @src-pred row 6 |
| |
| vld1.u8 d16, [r0], r3 @src row 8 |
| vld1.u8 d17, [r1], r4 @pred row 8 |
| vsubl.u8 q6, d18, d19 @src-pred row 7 |
| |
| lsl r5, r5, #2 @dst_stride * 4, because 32-bit values are stored |
| |
| |
| vsubl.u8 q7, d16, d17 @src-pred row 8 |
| |
| @after this |
| @Q0 -> row 1 |
| @Q1 -> row 2 |
| @Q2 -> row 3 |
| @Q3 -> row 4 |
| @Q4 -> row 5 |
| @Q5 -> row 6 |
| @Q6 -> row 7 |
| @Q7 -> row 8 |
| |
| @transpose the matrix so that we can do the horizontal transform first |
| |
| @transpose the inner 2x2 blocks |
| vtrn.16 q0, q1 |
| vtrn.16 q2, q3 |
| vtrn.16 q4, q5 |
| vtrn.16 q6, q7 |
| |
| @transpose the inner 4x4 blocks |
| vtrn.32 q0, q2 |
| vtrn.32 q1, q3 |
| |
| vtrn.32 q4, q6 |
| vtrn.32 q5, q7 |
| |
| @transpose the outer 8x8 blocks |
| vswp d1, d8 |
| vswp d7, d14 |
| vswp d3, d10 |
| vswp d5, d12 |
| @transpose done, data is in Q0-Q7 (r0..r7 in the comments below) |
| |
| @horizontal transform: even part uses a0..a7, odd part uses b0..b7 |
| |
| vadd.s16 q8, q0, q7 @ a0 = r0 + r7; |
| vadd.s16 q9, q1, q6 @ a1 = r1 + r6; |
| vadd.s16 q10, q2, q5 @ a2 = r2 + r5; |
| vadd.s16 q11, q3, q4 @ a3 = r3 + r4; |
| |
| vsub.s16 q12, q0, q7 @ b0 = r0 - r7; |
| vsub.s16 q13, q1, q6 @ b1 = r1 - r6; |
| vsub.s16 q15, q3, q4 @ b3 = r3 - r4; |
| vsub.s16 q14, q2, q5 @ b2 = r2 - r5; |
| |
| vadd.s16 q1, q8, q11 @ a4 = a0 + a3; |
| vadd.s16 q3, q9, q10 @ a5 = a1 + a2; |
| vsub.s16 q7, q9, q10 @ a7 = a1 - a2; |
| vsub.s16 q5, q8, q11 @ a6 = a0 - a3; |
| |
| ldr r6, g_scal_coff_h264_8x8_addr |
| 8x8lbl: |
| add r6, r6, pc @r6 = address of g_scal_coff_h264_8x8 |
| |
| vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5; |
| vshr.s16 q8, q7, #1 @ D_SHIFT(a7,1,shft) |
| |
| vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5; |
| |
| vadd.s16 q2, q5, q8 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft); |
| |
| |
| vshr.s16 q9, q5, #1 @ D_SHIFT(a6,1,shft) |
| vsub.s16 q6, q9, q7 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7; |
| |
| @Q0,Q2,Q4,Q6 now hold results and must not be changed |
| @Q1,Q3,Q5,Q7 will receive the odd results |
| @Q8,Q9,Q10,Q11 are free to use as scratch |
| |
| vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft) |
| vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft) |
| vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft) |
| vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft) |
| |
| vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0); |
| vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1); |
| vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2); |
| vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3); |
| |
| vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0); |
| vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1); |
| vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2); |
| vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3); |
| |
| vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0); |
| vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2); |
| vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1); |
| vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3); |
| |
| vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft) |
| vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft); |
| vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft); |
| vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft); |
| |
| |
| vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft); |
| vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft); |
| vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft); |
| vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7; |
| |
| @------------horiz transform done------------------------- |
| @results are in Q0-Q7 |
| @all other neon registers can be used at will |
| |
| @vertical transform: transpose, then repeat the same butterfly |
| |
| @transpose the inner 2x2 blocks |
| vtrn.16 q0, q1 |
| vtrn.16 q2, q3 |
| vtrn.16 q4, q5 |
| vtrn.16 q6, q7 |
| |
| @transpose the inner 4x4 blocks |
| vtrn.32 q0, q2 |
| vtrn.32 q1, q3 |
| |
| vtrn.32 q4, q6 |
| vtrn.32 q5, q7 |
| |
| @transpose the outer 8x8 blocks |
| vswp d1, d8 |
| vswp d3, d10 |
| vswp d5, d12 |
| vswp d7, d14 |
| |
| @transpose done |
| |
| vadd.s16 q8, q0, q7 @ a0 = r0 + r7; |
| vadd.s16 q9, q1, q6 @ a1 = r1 + r6; |
| vadd.s16 q10, q2, q5 @ a2 = r2 + r5; |
| vadd.s16 q11, q3, q4 @ a3 = r3 + r4; |
| |
| vsub.s16 q12, q0, q7 @ b0 = r0 - r7; |
| vsub.s16 q13, q1, q6 @ b1 = r1 - r6; |
| vsub.s16 q14, q2, q5 @ b2 = r2 - r5; |
| vsub.s16 q15, q3, q4 @ b3 = r3 - r4; |
| |
| vadd.s16 q1, q8, q11 @ a4 = a0 + a3; |
| vadd.s16 q3, q9, q10 @ a5 = a1 + a2; |
| vsub.s16 q5, q8, q11 @ a6 = a0 - a3; |
| vsub.s16 q7, q9, q10 @ a7 = a1 - a2; |
| |
| |
| vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5; |
| |
| vshr.s16 q8, q7, #1 @ D_SHIFT(a7,1,shft) |
| vadd.s16 q2, q5, q8 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft); |
| |
| vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5; |
| |
| vshr.s16 q9, q5, #1 @ D_SHIFT(a6,1,shft) |
| vsub.s16 q6, q9, q7 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7; |
| |
| @Q0,Q2,Q4,Q6 now hold results and must not be changed |
| @Q1,Q3,Q5,Q7 will receive the odd results |
| @Q8,Q9,Q10,Q11 are free to use as scratch |
| |
| vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft) |
| vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft) |
| vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft) |
| vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft) |
| |
| |
| vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0); |
| vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1); |
| vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2); |
| vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3); |
| |
| vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0); |
| vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2); |
| vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1); |
| vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3); |
| |
| vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0); |
| vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2); |
| vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1); |
| vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3); |
| |
| vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft) |
| vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft); |
| vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft); |
| vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft); |
| |
| |
| @results stay 16 bit here; the widening to 32 bit happens in the vmull scaling below |
| vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft); |
| vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7; |
| vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft); |
| vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft); |
| |
| @------------vert transform done------------------------- |
| @results are in Q0-Q7 |
| @all other neon registers can be used at will |
| |
| @scaling |
| @only 16 scaling values are loaded: d28..d31 hold four values each. |
| @d28 scales both halves of rows 1 and 5, d29 rows 2 and 6, |
| @d30 rows 3 and 7, d31 rows 4 and 8. |
| vld1.s16 {q14-q15}, [r6] @load the 16 scaling values into d28-d31 |
| |
| @vmull.s16 widens each 16x16 product to 32 bit |
| @-----------------------------rows 1 to 3 |
| |
| vmull.s16 q8, d0, d28 @scale row 1, first 4 elements |
| vmull.s16 q9, d28, d1 @scale row 1, last 4 elements |
| |
| vmull.s16 q10, d2, d29 @scale row 2, first 4 elements |
| vmull.s16 q11, d29, d3 @scale row 2, last 4 elements |
| vmull.s16 q12, d4, d30 @scale row 3, first 4 elements |
| |
| vst1.s32 {q8, q9}, [r2], r5 @store row 1, r2 += scaled dst stride |
| |
| vmull.s16 q13, d30, d5 @scale row 3, last 4 elements |
| vmull.s16 q8, d6, d31 @scale row 4, first 4 elements |
| |
| |
| vst1.s32 {q10, q11}, [r2], r5 @store row 2 |
| |
| @------------------------------- rows 4 to 6 |
| |
| vmull.s16 q9, d31, d7 @scale row 4, last 4 elements |
| |
| vst1.s32 {q12, q13}, [r2], r5 @store row 3 |
| |
| vmull.s16 q10, d8, d28 @scale row 5, first 4 elements |
| vmull.s16 q11, d28, d9 @scale row 5, last 4 elements |
| |
| vmull.s16 q12, d10, d29 @scale row 6, first 4 elements |
| |
| |
| vst1.s32 {q8, q9}, [r2], r5 @store row 4 |
| |
| @-------------------------------- rows 6 to 8 |
| |
| vmull.s16 q13, d29, d11 @scale row 6, last 4 elements |
| |
| vmull.s16 q8, d12, d30 @scale row 7, first 4 elements |
| |
| vst1.s32 {q10, q11}, [r2], r5 @store row 5 |
| |
| vmull.s16 q9, d30, d13 @scale row 7, last 4 elements |
| vmull.s16 q10, d14, d31 @scale row 8, first 4 elements |
| |
| |
| vst1.s32 {q12, q13}, [r2], r5 @store row 6 |
| |
| vmull.s16 q11, d31, d15 @scale row 8, last 4 elements |
| |
| vst1.s32 {q8, q9}, [r2], r5 @store row 7 |
| vst1.s32 {q10, q11}, [r2], r5 @store row 8 |
| |
| @----------------------------------done writing |
| |
| vpop {d8-d15} @restore the callee-saved NEON registers |
| pop {r4-r12, pc} @restore the core registers and return |
| |
| |
| |
| |
| |
| |