| @/****************************************************************************** |
| @ * |
| @ * Copyright (C) 2015 The Android Open Source Project |
| @ * |
| @ * Licensed under the Apache License, Version 2.0 (the "License"); |
| @ * you may not use this file except in compliance with the License. |
| @ * You may obtain a copy of the License at: |
| @ * |
| @ * http://www.apache.org/licenses/LICENSE-2.0 |
| @ * |
| @ * Unless required by applicable law or agreed to in writing, software |
| @ * distributed under the License is distributed on an "AS IS" BASIS, |
| @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| @ * See the License for the specific language governing permissions and |
| @ * limitations under the License. |
| @ * |
| @ ***************************************************************************** |
| @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| @*/ |
| |
| @/* |
| @//---------------------------------------------------------------------------- |
| @// File Name : impeg2_inter_pred.s |
| @// |
| @// Description : This file has motion compensation related |
| @// interpolation functions on Neon + CortexA-8 platform |
| @// |
| @// Reference Document : |
| @// |
| @// Revision History : |
| @// Date Author Detail Description |
| @// ------------ ---------------- ---------------------------------- |
| @// 18 jun 2010 S Hamsalekha Created |
| @// |
| @//------------------------------------------------------------------------- |
| @*/ |
| |
| @/* |
| @// ---------------------------------------------------------------------------- |
| @// Include Files |
| @// ---------------------------------------------------------------------------- |
| @*/ |
| .text |
| .p2align 2 |
| |
| |
| @/* |
| @// ---------------------------------------------------------------------------- |
| @// Struct/Union Types and Define |
| @// ---------------------------------------------------------------------------- |
| @*/ |
| |
| |
| @/* |
| @// ---------------------------------------------------------------------------- |
| @// Static Global Data section variables |
| @// ---------------------------------------------------------------------------- |
| @*/ |
| @// -------------------------- NONE -------------------------------------------- |
| |
| |
| @/* |
| @// ---------------------------------------------------------------------------- |
| @// Static Prototype Functions |
| @// ---------------------------------------------------------------------------- |
| @*/ |
| @// -------------------------- NONE -------------------------------------------- |
| |
| @/* |
| @// ---------------------------------------------------------------------------- |
| @// Exported functions |
| @// ---------------------------------------------------------------------------- |
| @*/ |
| |
@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_copy_mb_a9q()
@//
@// Detail Description : Copies one macroblock from src to dst: 16 rows of
@//                      16 bytes for Y, followed by 8 rows of 8 bytes for
@//                      U and 8 rows of 8 bytes for V.
@//
@// Inputs             : r0 - pointer to the src descriptor; the words at
@//                           byte offsets 0, 4 and 8 are read as the Y, U
@//                           and V pointers
@//                      r1 - pointer to the dst descriptor (same layout)
@//                      r2 - source width, used as the Y row step in bytes
@//                           (halved for U and V)
@//                      r3 - destination width, used as the Y row step in
@//                           bytes (halved for U and V)
@//
@// Registers Used     : r2, r3 (modified), r4, r5, d0, d1
@//
@// Stack Usage        : 12 bytes (r4, r5, lr)
@//
@// Outputs            : Y, U and V data written through the dst pointers
@//
@// Return Data        : None
@//
@// Programming Note   : The row copies are emitted with .rept, so the
@//                      assembled code is fully unrolled (no loop counter,
@//                      no branches).
@//-----------------------------------------------------------------------------
@*/

    .global impeg2_copy_mb_a9q

impeg2_copy_mb_a9q:

    push            {r4, r5, lr}

    @// ---- Y : 16 rows x 16 bytes ------------------------------------------
    ldr             r4, [r0]                @ r4 = src Y pointer
    ldr             r5, [r1]                @ r5 = dst Y pointer
    .rept           16
    vld1.8          {d0, d1}, [r4], r2      @ fetch one row, src += src step
    vst1.8          {d0, d1}, [r5], r3      @ write it out, dst += dst step
    .endr

    @// U and V rows step by half the Y step
    mov             r2, r2, lsr #1
    mov             r3, r3, lsr #1

    @// ---- U : 8 rows x 8 bytes --------------------------------------------
    ldr             r4, [r0, #4]            @ r4 = src U pointer
    ldr             r5, [r1, #4]            @ r5 = dst U pointer
    .rept           8
    vld1.8          {d0}, [r4], r2
    vst1.8          {d0}, [r5], r3
    .endr

    @// ---- V : 8 rows x 8 bytes --------------------------------------------
    ldr             r4, [r0, #8]            @ r4 = src V pointer
    ldr             r5, [r1, #8]            @ r5 = dst V pointer
    .rept           8
    vld1.8          {d0}, [r4], r2
    vst1.8          {d0}, [r5], r3
    .endr

    pop             {r4, r5, pc}
| |
| |
| |
| |
@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_fullx_halfy_8x8_a9q()
@//
@// Detail Description : Vertical half-pel prediction of an 8x8 block.
@//                      Nine reference rows of 8 bytes are read and every
@//                      output row is the rounded average of two vertically
@//                      adjacent reference rows:
@//                          out[y][x] = (ref[y][x] + ref[y+1][x] + 1) >> 1
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width (row step, bytes)
@//                      r3 - out_wid : Current Block Width (row step, bytes)
@//
@// Registers Used     : r14, d0-d7, d16, d17
@//                      (r0-r3 are modified as well)
@//
@// Stack Usage        : 4 bytes (lr)
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Only caller-saved NEON registers are used. d8-d15
@//                      are callee-saved under the AAPCS and must not be
@//                      used here as scratch unless they are saved and
@//                      restored.
@//-----------------------------------------------------------------------------
@*/

    .global impeg2_mc_fullx_halfy_8x8_a9q

impeg2_mc_fullx_halfy_8x8_a9q:

    stmfd           r13!, {r14}
    add             r14, r1, r2             @ r14 walks the even ref rows (2, 4, 6, 8)
    mov             r2, r2, lsl #1          @ each pointer steps two rows at a time

    @/* Load 8 + 1 rows from the reference block. Rows n and n+4 are placed  */
    @/* in the two halves of one q register so that a single q-wide vrhadd   */
    @/* produces two output rows. vrhadd supplies the +1 rounding term.      */
    vld1.8          {d0}, [r1], r2          @ ref row 1 -> d0
    vld1.8          {d2}, [r14], r2         @ ref row 2 -> d2
    vld1.8          {d4}, [r1], r2          @ ref row 3 -> d4
    vld1.8          {d6}, [r14], r2         @ ref row 4 -> d6
    vld1.8          {d1}, [r1], r2          @ ref row 5 -> d1
    vld1.8          {d3}, [r14], r2         @ ref row 6 -> d3
    vrhadd.u8       d17, d1, d6             @ out row 4 = avg(row 4, row 5); must precede the overwrite of d1
    vld1.8          {d5}, [r1], r2          @ ref row 7 -> d5
    vrhadd.u8       q0, q0, q1              @ d0 = out row 1, d1 = out row 5
    vld1.8          {d7}, [r14], r2         @ ref row 8 -> d7
    vrhadd.u8       q1, q1, q2              @ d2 = out row 2, d3 = out row 6
    vld1.8          {d16}, [r1], r2         @ ref row 9 -> d16
    vrhadd.u8       q2, q2, q3              @ d4 = out row 3, d5 = out row 7

    add             r14, r0, r3             @ r14 walks the even out rows
    mov             r3, r3, lsl #1          @ each pointer steps two rows at a time

    @/* Store the eight rows calculated above */
    vst1.8          {d2}, [r14], r3         @ out row 2
    vrhadd.u8       d7, d7, d16             @ out row 8 = avg(row 8, row 9)
    vst1.8          {d0}, [r0], r3          @ out row 1
    vst1.8          {d17}, [r14], r3        @ out row 4
    vst1.8          {d4}, [r0], r3          @ out row 3
    vst1.8          {d3}, [r14], r3         @ out row 6
    vst1.8          {d1}, [r0], r3          @ out row 5
    vst1.8          {d7}, [r14], r3         @ out row 8
    vst1.8          {d5}, [r0], r3          @ out row 7

    ldmfd           sp!, {pc}
| |
| |
| |
| |
| |
| |
@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_halfx_fully_8x8_a9q()
@//
@// Detail Description : Horizontal half-pel prediction of an 8x8 block.
@//                      Every output pixel is the rounded average of two
@//                      horizontally adjacent reference pixels:
@//                          out[y][x] = (ref[y][x] + ref[y][x+1] + 1) >> 1
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width (row step, bytes)
@//                      r3 - out_wid : Current Block Width (row step, bytes)
@//
@// Registers Used     : r12, r14, d0-d7, d16-d18, d20-d22, d24-d26, d28-d30
@//                      (r0, r1 are modified as well)
@//
@// Stack Usage        : 8 bytes (r12, lr)
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : 16 bytes are loaded from each of the 8 reference
@//                      rows although only bytes 0-8 contribute to the
@//                      result.
@//                      Only caller-saved NEON registers are used. d8-d15
@//                      are callee-saved under the AAPCS and must not be
@//                      used here as scratch unless they are saved and
@//                      restored.
@//-----------------------------------------------------------------------------
@*/

    .global impeg2_mc_halfx_fully_8x8_a9q

impeg2_mc_halfx_fully_8x8_a9q:

    stmfd           sp!, {r12, lr}

    add             r14, r1, r2, lsl #2     @ r14 -> ref row 5
    add             r12, r0, r3, lsl #2     @ r12 -> out row 5

    vld1.8          {d0, d1}, [r1], r2      @ row1, 16 pixels
    vld1.8          {d2, d3}, [r14], r2     @ row5
    vld1.8          {d4, d5}, [r1], r2      @ row2
    vld1.8          {d6, d7}, [r14], r2     @ row6

    @ The shifted copy of row n lands in the low half of a q register whose
    @ high half later receives the unshifted row n+2, so that one q-wide
    @ vrhadd averages two rows at once.
    vext.8          d24, d0, d1, #1         @ pixels (1-8) of row1 -> low half of q12
    vext.8          d28, d2, d3, #1         @ pixels (1-8) of row5 -> low half of q14
    vext.8          d16, d4, d5, #1         @ pixels (1-8) of row2 -> low half of q8
    vext.8          d20, d6, d7, #1         @ pixels (1-8) of row6 -> low half of q10

    vld1.8          {d25, d26}, [r1], r2    @ row3 (d25 = high half of q12)
    vld1.8          {d29, d30}, [r14], r2   @ row7 (d29 = high half of q14)
    vld1.8          {d17, d18}, [r1], r2    @ row4 (d17 = high half of q8)
    vld1.8          {d21, d22}, [r14], r2   @ row8 (d21 = high half of q10)

    vext.8          d1, d25, d26, #1        @ pixels (1-8) of row3 -> high half of q0
    vext.8          d3, d29, d30, #1        @ pixels (1-8) of row7 -> high half of q1
    vext.8          d5, d17, d18, #1        @ pixels (1-8) of row4 -> high half of q2
    vext.8          d7, d21, d22, #1        @ pixels (1-8) of row8 -> high half of q3

    vrhadd.u8       q0, q0, q12             @ d0 = out row1, d1 = out row3
    vrhadd.u8       q1, q1, q14             @ d2 = out row5, d3 = out row7
    vrhadd.u8       q2, q2, q8              @ d4 = out row2, d5 = out row4
    vrhadd.u8       q3, q3, q10             @ d6 = out row6, d7 = out row8

    vst1.8          {d0}, [r0], r3          @ store row1
    vst1.8          {d2}, [r12], r3         @ store row5
    vst1.8          {d4}, [r0], r3          @ store row2
    vst1.8          {d6}, [r12], r3         @ store row6
    vst1.8          {d1}, [r0], r3          @ store row3
    vst1.8          {d3}, [r12], r3         @ store row7
    vst1.8          {d5}, [r0], r3          @ store row4
    vst1.8          {d7}, [r12], r3         @ store row8

    ldmfd           sp!, {r12, pc}
| |
| |
| |
| |
| |
| |
| |
| |
@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_halfx_halfy_8x8_a9q()
@//
@// Detail Description : Half-pel prediction in both directions for an 8x8
@//                      block. Every output pixel is the rounded average
@//                      of a 2x2 group of reference pixels:
@//                          out[y][x] = (ref[y][x]   + ref[y][x+1] +
@//                                       ref[y+1][x] + ref[y+1][x+1] + 2) >> 2
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width (row step, bytes)
@//                      r3 - out_wid : Current Block Width (row step, bytes)
@//
@// Registers Used     : r14, q0-q15
@//                      (r0, r1 are modified as well)
@//
@// Stack Usage        : 68 bytes (lr + d8-d15)
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : 16 bytes are loaded from each of the 9 reference
@//                      rows although only bytes 0-8 contribute to the
@//                      result.
@//                      d8-d15 are callee-saved under the AAPCS; they are
@//                      used as scratch here and are therefore saved on
@//                      entry and restored on exit.
@//-----------------------------------------------------------------------------
@*/

    .global impeg2_mc_halfx_halfy_8x8_a9q

impeg2_mc_halfx_halfy_8x8_a9q:

    stmfd           sp!, {r14}
    vpush           {d8-d15}                @ preserve callee-saved NEON registers

    add             r14, r1, r2, lsl #2     @ r14 -> ref row 5

    vld1.8          {d0, d1}, [r1], r2      @ load 16 pixels of row1
    vld1.8          {d2, d3}, [r14], r2     @ row5
    vld1.8          {d4, d5}, [r1], r2      @ row2
    vld1.8          {d6, d7}, [r14], r2     @ row6

    vext.8          d1, d0, d1, #1          @ pixels (1-8) of row1
    vext.8          d3, d2, d3, #1          @ pixels (1-8) of row5
    vext.8          d5, d4, d5, #1          @ pixels (1-8) of row2
    vext.8          d7, d6, d7, #1          @ pixels (1-8) of row6

    vld1.8          {d8, d9}, [r1], r2      @ row3
    vld1.8          {d10, d11}, [r14], r2   @ row7
    vld1.8          {d12, d13}, [r1], r2    @ row4
    vld1.8          {d14, d15}, [r14], r2   @ row8

    vext.8          d9, d8, d9, #1          @ pixels (1-8) of row3

    vld1.8          {d16, d17}, [r14], r2   @ row9

    vext.8          d11, d10, d11, #1       @ pixels (1-8) of row7
    vext.8          d13, d12, d13, #1       @ pixels (1-8) of row4
    vext.8          d15, d14, d15, #1       @ pixels (1-8) of row8
    vext.8          d17, d16, d17, #1       @ pixels (1-8) of row9

    @ Interpolation in x direction: widen to 16 bit, p[x] + p[x+1] per row
    vaddl.u8        q0, d0, d1              @ row1
    vaddl.u8        q1, d2, d3              @ row5
    vaddl.u8        q2, d4, d5              @ row2
    vaddl.u8        q3, d6, d7              @ row6
    vaddl.u8        q4, d8, d9              @ row3
    vaddl.u8        q5, d10, d11            @ row7
    vaddl.u8        q6, d12, d13            @ row4
    vaddl.u8        q7, d14, d15            @ row8
    vaddl.u8        q8, d16, d17            @ row9

    @ Interpolation in y direction: add the sums of adjacent rows, then
    @ narrow with a rounding shift by 2 (adds 2 before shifting)
    add             r14, r0, r3, lsl #2     @ r14 -> out row 5

    vadd.u16        q9, q0, q2              @ row1 + row2
    vadd.u16        q13, q1, q3             @ row5 + row6
    vadd.u16        q10, q2, q4             @ row2 + row3
    vadd.u16        q14, q3, q5             @ row6 + row7

    vrshrn.u16      d18, q9, #2             @ out row1
    vrshrn.u16      d26, q13, #2            @ out row5
    vrshrn.u16      d20, q10, #2            @ out row2
    vrshrn.u16      d28, q14, #2            @ out row6

    vadd.u16        q11, q4, q6             @ row3 + row4
    vst1.8          {d18}, [r0], r3         @ store row1
    vadd.u16        q15, q5, q7             @ row7 + row8
    vst1.8          {d26}, [r14], r3        @ store row5
    vadd.u16        q12, q6, q1             @ row4 + row5
    vst1.8          {d20}, [r0], r3         @ store row2
    vadd.u16        q7, q7, q8              @ row8 + row9
    vst1.8          {d28}, [r14], r3        @ store row6

    vrshrn.u16      d22, q11, #2            @ out row3
    vrshrn.u16      d30, q15, #2            @ out row7
    vrshrn.u16      d24, q12, #2            @ out row4
    vrshrn.u16      d14, q7, #2             @ out row8

    vst1.8          {d22}, [r0], r3         @ store row3
    vst1.8          {d30}, [r14], r3        @ store row7
    vst1.8          {d24}, [r0], r3         @ store row4
    vst1.8          {d14}, [r14], r3        @ store row8

    vpop            {d8-d15}                @ restore callee-saved NEON registers
    ldmfd           sp!, {pc}
| |
| |
| |
| |
| |
@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_fullx_fully_8x8_a9q()
@//
@// Detail Description : Copies an 8x8 block (8 rows of 8 bytes) from the
@//                      reference block to the current block. No
@//                      interpolation is performed.
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference Block Width (row step, bytes)
@//                      r3 - out_wid : Current Block Width (row step, bytes)
@//
@// Registers Used     : r12, r14, d0-d3
@//                      (r0, r1 are modified as well)
@//
@// Stack Usage        : 8 bytes (r12, lr)
@//
@// Outputs            : The copied block
@//
@// Return Data        : None
@//
@// Programming Note   : Rows 1-4 are walked through r1/r0 and rows 5-8
@//                      through lr/r12, interleaved. The two passes are
@//                      emitted with .rept, so the assembled code is fully
@//                      unrolled.
@//-----------------------------------------------------------------------------
@*/

    .global impeg2_mc_fullx_fully_8x8_a9q

impeg2_mc_fullx_fully_8x8_a9q:

    push            {r12, lr}

    add             lr, r1, r2, lsl #2      @ lr  -> ref row 5
    add             r12, r0, r3, lsl #2     @ r12 -> out row 5

    @ Pass 1 moves rows 1, 5, 2, 6; pass 2 moves rows 3, 7, 4, 8.
    .rept           2
    vld1.8          {d0}, [r1], r2          @ next row of the upper half
    vld1.8          {d1}, [lr], r2          @ next row of the lower half
    vld1.8          {d2}, [r1], r2          @ following row, upper half
    vld1.8          {d3}, [lr], r2          @ following row, lower half

    vst1.8          {d0}, [r0], r3
    vst1.8          {d1}, [r12], r3
    vst1.8          {d2}, [r0], r3
    vst1.8          {d3}, [r12], r3
    .endr

    pop             {r12, pc}
| |
| |
| |
| |
| |
@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_interpolate_a9q()
@//
@// Detail Description : Writes the rounded average of two macroblock
@//                      buffers to the destination:
@//                          dst = (src1 + src2 + 1) >> 1
@//                      for 16x16 Y, 8x8 U and 8x8 V.
@//
@// Inputs             : r0 - pointer to the src1 descriptor; the words at
@//                           byte offsets 0, 4 and 8 are read as the Y, U
@//                           and V pointers
@//                      r1 - pointer to the src2 descriptor (same layout)
@//                      r2 - pointer to the dest buf descriptor (same layout)
@//                      r3 - dst stride for Y in bytes (halved for U and V)
@//
@// Registers Used     : r4, r5, r7, r12, d0-d7, d16-d23
@//                      (r3 is modified as well)
@//
@// Stack Usage        : 20 bytes (r4, r5, r7, r12, lr)
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : src1 and src2 are read contiguously (the source
@//                      pointers post-increment by the 16 bytes just
@//                      loaded), i.e. Y rows are taken 16 bytes apart and
@//                      U/V rows 8 bytes apart; only dst uses a stride.
@//                      Only caller-saved NEON registers are used. d8-d15
@//                      are callee-saved under the AAPCS and must not be
@//                      used here as scratch unless they are saved and
@//                      restored.
@//-----------------------------------------------------------------------------
@*/

    .global impeg2_interpolate_a9q

impeg2_interpolate_a9q:

    stmfd           r13!, {r4, r5, r7, r12, r14}

    ldr             r4, [r0, #0]            @ ptr_y src1
    ldr             r5, [r1, #0]            @ ptr_y src2
    ldr             r7, [r2, #0]            @ ptr_y dst buf

    mov             r12, #4                 @ 4 passes of 4 rows = 16 Y rows

interp_lumablocks_stride:

    vld1.8          {d0, d1}, [r4]!         @ row1 src1
    vld1.8          {d2, d3}, [r4]!         @ row2 src1
    vld1.8          {d4, d5}, [r4]!         @ row3 src1
    vld1.8          {d6, d7}, [r4]!         @ row4 src1

    vld1.8          {d16, d17}, [r5]!       @ row1 src2
    vld1.8          {d18, d19}, [r5]!       @ row2 src2
    vld1.8          {d20, d21}, [r5]!       @ row3 src2
    vld1.8          {d22, d23}, [r5]!       @ row4 src2

    vrhadd.u8       q0, q0, q8              @ row1 = avg(src1, src2)
    vrhadd.u8       q1, q1, q9              @ row2
    vrhadd.u8       q2, q2, q10             @ row3
    vrhadd.u8       q3, q3, q11             @ row4

    vst1.8          {d0, d1}, [r7], r3      @ row1
    vst1.8          {d2, d3}, [r7], r3      @ row2
    vst1.8          {d4, d5}, [r7], r3      @ row3
    vst1.8          {d6, d7}, [r7], r3      @ row4

    subs            r12, r12, #1
    bne             interp_lumablocks_stride

    mov             r3, r3, lsr #1          @ stride >> 1 for U and V

    ldr             r4, [r0, #4]            @ ptr_u src1
    ldr             r5, [r1, #4]            @ ptr_u src2
    ldr             r7, [r2, #4]            @ ptr_u dst buf

    mov             r12, #2                 @ 2 passes: U first, then V

    @ chroma blocks: one pass handles a whole 8x8 block (64 bytes per source)

interp_chromablocks_stride:

    vld1.8          {d0, d1}, [r4]!         @ row1 & 2 src1
    vld1.8          {d2, d3}, [r4]!         @ row3 & 4 src1
    vld1.8          {d4, d5}, [r4]!         @ row5 & 6 src1
    vld1.8          {d6, d7}, [r4]!         @ row7 & 8 src1

    vld1.8          {d16, d17}, [r5]!       @ row1 & 2 src2
    vld1.8          {d18, d19}, [r5]!       @ row3 & 4 src2
    vld1.8          {d20, d21}, [r5]!       @ row5 & 6 src2
    vld1.8          {d22, d23}, [r5]!       @ row7 & 8 src2

    vrhadd.u8       q0, q0, q8              @ row1 & 2 = avg(src1, src2)
    vrhadd.u8       q1, q1, q9              @ row3 & 4
    vrhadd.u8       q2, q2, q10             @ row5 & 6
    vrhadd.u8       q3, q3, q11             @ row7 & 8

    vst1.8          {d0}, [r7], r3          @ row1
    vst1.8          {d1}, [r7], r3          @ row2
    vst1.8          {d2}, [r7], r3          @ row3
    vst1.8          {d3}, [r7], r3          @ row4
    vst1.8          {d4}, [r7], r3          @ row5
    vst1.8          {d5}, [r7], r3          @ row6
    vst1.8          {d6}, [r7], r3          @ row7
    vst1.8          {d7}, [r7], r3          @ row8

    @ Switch to the V pointers for the second pass (reloaded again, unused,
    @ after the final pass)
    ldr             r4, [r0, #8]            @ ptr_v src1
    ldr             r5, [r1, #8]            @ ptr_v src2
    ldr             r7, [r2, #8]            @ ptr_v dst buf

    subs            r12, r12, #1
    bne             interp_chromablocks_stride

    ldmfd           r13!, {r4, r5, r7, r12, pc}
| |
| |
| |
| |
| |