blob: f1b3dde878e6c2d970615eb1c04219678caca932 [file] [log] [blame]
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@/*
@//----------------------------------------------------------------------------
@// File Name : impeg2_inter_pred.s
@//
@// Description : This file has motion compensation related
@// interpolation functions on Neon + CortexA-8 platform
@//
@// Reference Document :
@//
@// Revision History :
@// Date Author Detail Description
@// ------------ ---------------- ----------------------------------
@// 18 jun 2010 S Hamsalekha Created
@//
@//-------------------------------------------------------------------------
@*/
@/*
@// ----------------------------------------------------------------------------
@// Include Files
@// ----------------------------------------------------------------------------
@*/
.text
.p2align 2
@/*
@// ----------------------------------------------------------------------------
@// Struct/Union Types and Define
@// ----------------------------------------------------------------------------
@*/
@/*
@// ----------------------------------------------------------------------------
@// Static Global Data section variables
@// ----------------------------------------------------------------------------
@*/
@// -------------------------- NONE --------------------------------------------
@/*
@// ----------------------------------------------------------------------------
@// Static Prototype Functions
@// ----------------------------------------------------------------------------
@*/
@// -------------------------- NONE --------------------------------------------
@/*
@// ----------------------------------------------------------------------------
@// Exported functions
@// ----------------------------------------------------------------------------
@*/
@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_copy_mb_a9q()
@//
@// Detail Description : Copies one macroblock from src to dst:
@//                      16 rows x 16 bytes of luma, then 8 rows x 8 bytes
@//                      of each chroma plane (u, then v).
@//
@// Inputs             : r0 - pointer to the src descriptor; the words at
@//                           offsets 0, 4 and 8 hold the y, u and v plane
@//                           pointers
@//                      r1 - pointer to the dst descriptor (same layout)
@//                      r2 - source luma row stride in bytes
@//                      r3 - destination luma row stride in bytes
@//                      Chroma rows use half of each stride (r2 >> 1, r3 >> 1).
@//
@// Registers Used     : r4, r5 (saved/restored), d0, d1
@//                      r2 and r3 are halved in place.
@//
@// Stack Usage        : 12 bytes
@//
@// Outputs            : The copied macroblock in the dst planes
@//
@// Return Data        : None
@//
@// Programming Note   : The row copies are generated with .rept, which is
@//                      expanded by the assembler; the emitted code is a
@//                      fully unrolled straight-line sequence.
@//-----------------------------------------------------------------------------
@*/
    .global impeg2_copy_mb_a9q
impeg2_copy_mb_a9q:
    stmdb       sp!, {r4, r5, lr}

    @ ---- luma: 16 rows of 16 bytes ----
    ldr         r4, [r0, #0]            @ r4 = src->y
    ldr         r5, [r1, #0]            @ r5 = dst->y
    .rept       16
    vld1.8      {d0-d1}, [r4], r2       @ fetch one luma row, step src by its stride
    vst1.8      {d0-d1}, [r5], r3       @ write it out, step dst by its stride
    .endr

    @ Chroma planes are half as wide, so both strides are halved.
    mov         r2, r2, lsr #1          @ src stride for chroma
    mov         r3, r3, lsr #1          @ dst stride for chroma

    @ ---- u plane: 8 rows of 8 bytes ----
    ldr         r4, [r0, #4]            @ r4 = src->u
    ldr         r5, [r1, #4]            @ r5 = dst->u
    .rept       8
    vld1.8      {d0}, [r4], r2          @ fetch one chroma row
    vst1.8      {d0}, [r5], r3          @ write it out
    .endr

    @ ---- v plane: 8 rows of 8 bytes ----
    ldr         r4, [r0, #8]            @ r4 = src->v
    ldr         r5, [r1, #8]            @ r5 = dst->v
    .rept       8
    vld1.8      {d0}, [r4], r2          @ fetch one chroma row
    vst1.8      {d0}, [r5], r3          @ write it out
    .endr

    ldmia       sp!, {r4, r5, pc}       @ restore and return
@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_fullx_halfy_8x8_a9q()
@//
@// Detail Description : Motion compensation of an 8x8 block with a full-pel
@//                      horizontal and half-pel vertical motion vector.
@//                      Each output row is the rounded average of two
@//                      vertically adjacent reference rows:
@//                        out[y][x] = (ref[y][x] + ref[y+1][x] + 1) >> 1
@//                      Nine reference rows of 8 bytes are read.
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference row stride in bytes
@//                      r3 - out_wid : Current block row stride in bytes
@//
@// Registers Used     : r14 (saved/restored), d0-d7, d16, d17
@//                      Only caller-saved NEON registers are used; d8-d15
@//                      must be preserved by a callee under the AAPCS, so the
@//                      two extra rows live in d16/d17.
@//                      r0-r3 are modified.
@//
@// Stack Usage        : 4 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Even and odd rows are walked with two pointers, each
@//                      stepping by twice the stride, so loads/stores on the
@//                      two pointers can be interleaved.
@//-----------------------------------------------------------------------------
@*/
    .global impeg2_mc_fullx_halfy_8x8_a9q
impeg2_mc_fullx_halfy_8x8_a9q:
    stmfd       r13!, {r14}
    add         r14, r1, r2             @ r14 -> ref row 2 (even rows); r1 -> odd rows
    mov         r2, r2, lsl #1          @ each pointer skips two rows per load

    @/* Load 8 + 1 rows from reference block */
    @/* vrhadd already adds the rounding value of 1 before halving */
    vld1.8      {d0}, [r1], r2          @ ref row 1 -> d0
    vld1.8      {d2}, [r14], r2         @ ref row 2 -> d2
    vld1.8      {d4}, [r1], r2          @ ref row 3 -> d4
    vld1.8      {d6}, [r14], r2         @ ref row 4 -> d6
    vld1.8      {d1}, [r1], r2          @ ref row 5 -> d1
    vld1.8      {d3}, [r14], r2         @ ref row 6 -> d3
    vrhadd.u8   d17, d1, d6             @ out row 4 = avg(ref 5, ref 4) -> d17
                                        @ (must run before d1 is overwritten below)
    vld1.8      {d5}, [r1], r2          @ ref row 7 -> d5
    vrhadd.u8   q0, q0, q1              @ out row 1 -> d0, out row 5 -> d1
    vld1.8      {d7}, [r14], r2         @ ref row 8 -> d7
    vrhadd.u8   q1, q1, q2              @ out row 2 -> d2, out row 6 -> d3
    vld1.8      {d16}, [r1], r2         @ ref row 9 -> d16
    vrhadd.u8   q2, q2, q3              @ out row 3 -> d4, out row 7 -> d5

    add         r14, r0, r3             @ r14 -> out row 2 (even rows); r0 -> odd rows
    mov         r3, r3, lsl #1          @ each pointer skips two rows per store

    @/* Store the eight rows calculated above */
    vst1.8      {d2}, [r14], r3         @ out row 2
    vrhadd.u8   d7, d7, d16             @ out row 8 = avg(ref 8, ref 9) -> d7
    vst1.8      {d0}, [r0], r3          @ out row 1
    vst1.8      {d17}, [r14], r3        @ out row 4
    vst1.8      {d4}, [r0], r3          @ out row 3
    vst1.8      {d3}, [r14], r3         @ out row 6
    vst1.8      {d1}, [r0], r3          @ out row 5
    vst1.8      {d7}, [r14], r3         @ out row 8
    vst1.8      {d5}, [r0], r3          @ out row 7
    ldmfd       sp!, {pc}
@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_halfx_fully_8x8_a9q()
@//
@// Detail Description : Motion compensation of an 8x8 block with a half-pel
@//                      horizontal and full-pel vertical motion vector.
@//                      Each output pixel is the rounded average of two
@//                      horizontally adjacent reference pixels:
@//                        out[y][x] = (ref[y][x] + ref[y][x+1] + 1) >> 1
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference row stride in bytes
@//                      r3 - out_wid : Current block row stride in bytes
@//
@// Registers Used     : r12, r14 (saved/restored), d0-d7, d16-d18, d20-d22,
@//                      d24-d26, d28-d30
@//                      Only caller-saved NEON registers are used; d8-d15
@//                      must be preserved by a callee under the AAPCS and are
@//                      deliberately avoided.
@//                      r0 and r1 are modified.
@//
@// Stack Usage        : 8 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Each reference row is fetched as a 16-byte load even
@//                      though only bytes 0..8 contribute to the result, so
@//                      16 bytes must be readable from the start of each of
@//                      the 8 reference rows.
@//                      Rows 1-4 and rows 5-8 are processed through two
@//                      pointers four rows apart so their loads interleave.
@//-----------------------------------------------------------------------------
@*/
    .global impeg2_mc_halfx_fully_8x8_a9q
impeg2_mc_halfx_fully_8x8_a9q:
    stmfd       sp!, {r12, lr}
    add         r14, r1, r2, lsl #2     @ r14 -> ref row 5
    add         r12, r0, r3, lsl #2     @ r12 -> out row 5

    vld1.8      {d0, d1}, [r1], r2      @ load 16 pixels of row1
    vld1.8      {d2, d3}, [r14], r2     @ row5
    vld1.8      {d4, d5}, [r1], r2      @ row2
    vld1.8      {d6, d7}, [r14], r2     @ row6

    @ vext #1 yields the same row shifted left by one pixel (pixels 1-8)
    vext.8      d24, d0, d1, #1         @ pixels (1-8) of row1
    vext.8      d28, d2, d3, #1         @ pixels (1-8) of row5
    vext.8      d16, d4, d5, #1         @ pixels (1-8) of row2
    vext.8      d20, d6, d7, #1         @ pixels (1-8) of row6

    @ The second row of each pair is loaded into the upper half of the
    @ q register that already holds the shifted first row, so one q-wide
    @ vrhadd below handles two rows at once.
    vld1.8      {d25, d26}, [r1], r2    @ row3
    vld1.8      {d29, d30}, [r14], r2   @ row7
    vld1.8      {d17, d18}, [r1], r2    @ row4
    vld1.8      {d21, d22}, [r14], r2   @ row8

    vext.8      d1, d25, d26, #1        @ pixels (1-8) of row3
    vext.8      d3, d29, d30, #1        @ pixels (1-8) of row7
    vext.8      d5, d17, d18, #1        @ pixels (1-8) of row4
    vext.8      d7, d21, d22, #1        @ pixels (1-8) of row8

    @ q0  = {row1,         row3 shifted}   q12 = {row1 shifted, row3}
    @ q1  = {row5,         row7 shifted}   q14 = {row5 shifted, row7}
    @ q2  = {row2,         row4 shifted}   q8  = {row2 shifted, row4}
    @ q3  = {row6,         row8 shifted}   q10 = {row6 shifted, row8}
    vrhadd.u8   q0, q0, q12             @ out row1 -> d0, out row3 -> d1
    vrhadd.u8   q1, q1, q14             @ out row5 -> d2, out row7 -> d3
    vrhadd.u8   q2, q2, q8              @ out row2 -> d4, out row4 -> d5
    vrhadd.u8   q3, q3, q10             @ out row6 -> d6, out row8 -> d7

    vst1.8      {d0}, [r0], r3          @ store row1
    vst1.8      {d2}, [r12], r3         @ store row5
    vst1.8      {d4}, [r0], r3          @ store row2
    vst1.8      {d6}, [r12], r3         @ store row6
    vst1.8      {d1}, [r0], r3          @ store row3
    vst1.8      {d3}, [r12], r3         @ store row7
    vst1.8      {d5}, [r0], r3          @ store row4
    vst1.8      {d7}, [r12], r3         @ store row8
    ldmfd       sp!, {r12, pc}
@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_halfx_halfy_8x8_a9q()
@//
@// Detail Description : Motion compensation of an 8x8 block with half-pel
@//                      motion in both directions. Each output pixel is the
@//                      rounded average of a 2x2 reference neighbourhood:
@//                        out[y][x] = (ref[y][x]   + ref[y][x+1] +
@//                                     ref[y+1][x] + ref[y+1][x+1] + 2) >> 2
@//                      Nine reference rows are read.
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference row stride in bytes
@//                      r3 - out_wid : Current block row stride in bytes
@//
@// Registers Used     : r14 (saved/restored), q0-q3, q8-q13
@//                      Only caller-saved NEON registers are used; q4-q7
@//                      (d8-d15) must be preserved by a callee under the
@//                      AAPCS and are deliberately avoided by accumulating
@//                      the vertical sums in place.
@//                      r0 and r1 are modified.
@//
@// Stack Usage        : 4 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Each reference row is fetched as a 16-byte load even
@//                      though only bytes 0..8 contribute to the result, so
@//                      16 bytes must be readable from the start of each of
@//                      the 9 reference rows.
@//
@//                      Register plan (hN = 16-bit horizontal sum of row N):
@//                        h1 q0, h2 q2, h3 q8, h4 q10,
@//                        h5 q1, h6 q3, h7 q9, h8 q11, h9 q12
@//                      Vertical sums overwrite the upper row's register
@//                      once that row has no further use. h5 is needed for
@//                      both out row4 and out row5, so out row5 goes to q13.
@//-----------------------------------------------------------------------------
@*/
    .global impeg2_mc_halfx_halfy_8x8_a9q
impeg2_mc_halfx_halfy_8x8_a9q:
    stmfd       sp!, {r14}
    add         r14, r1, r2, lsl #2     @ r14 -> ref row 5

    vld1.8      {d0, d1}, [r1], r2      @ load 16 pixels of row1
    vld1.8      {d2, d3}, [r14], r2     @ row5
    vld1.8      {d4, d5}, [r1], r2      @ row2
    vld1.8      {d6, d7}, [r14], r2     @ row6
    vext.8      d1, d0, d1, #1          @ pixels (1-8) of row1
    vext.8      d3, d2, d3, #1          @ pixels (1-8) of row5
    vext.8      d5, d4, d5, #1          @ pixels (1-8) of row2
    vext.8      d7, d6, d7, #1          @ pixels (1-8) of row6
    vld1.8      {d16, d17}, [r1], r2    @ row3
    vld1.8      {d18, d19}, [r14], r2   @ row7
    vld1.8      {d20, d21}, [r1], r2    @ row4
    vld1.8      {d22, d23}, [r14], r2   @ row8
    vext.8      d17, d16, d17, #1       @ pixels (1-8) of row3
    vld1.8      {d24, d25}, [r14], r2   @ row9
    vext.8      d19, d18, d19, #1       @ pixels (1-8) of row7
    vext.8      d21, d20, d21, #1       @ pixels (1-8) of row4
    vext.8      d23, d22, d23, #1       @ pixels (1-8) of row8
    vext.8      d25, d24, d25, #1       @ pixels (1-8) of row9

    @ interpolation in x direction: widen to 16 bit, hN = row[x] + row[x+1]
    vaddl.u8    q0, d0, d1              @ h1
    vaddl.u8    q1, d2, d3              @ h5
    vaddl.u8    q2, d4, d5              @ h2
    vaddl.u8    q3, d6, d7              @ h6
    vaddl.u8    q8, d16, d17            @ h3
    vaddl.u8    q9, d18, d19            @ h7
    vaddl.u8    q10, d20, d21           @ h4
    vaddl.u8    q11, d22, d23           @ h8
    vaddl.u8    q12, d24, d25           @ h9

    @ interpolation in y direction: sum adjacent rows, then (sum + 2) >> 2
    add         r14, r0, r3, lsl #2     @ r14 -> out row 5
    vadd.u16    q0, q0, q2              @ h1 + h2 (h1 no longer needed)
    vadd.u16    q13, q1, q3             @ h5 + h6 (h5 kept in q1 for row4)
    vadd.u16    q2, q2, q8              @ h2 + h3
    vadd.u16    q3, q3, q9              @ h6 + h7
    vrshrn.u16  d0, q0, #2              @ row1
    vrshrn.u16  d26, q13, #2            @ row5
    vrshrn.u16  d4, q2, #2              @ row2
    vrshrn.u16  d6, q3, #2              @ row6
    vadd.u16    q8, q8, q10             @ h3 + h4
    vst1.8      {d0}, [r0], r3          @ store row1
    vadd.u16    q9, q9, q11             @ h7 + h8
    vst1.8      {d26}, [r14], r3        @ store row5
    vadd.u16    q10, q10, q1            @ h4 + h5
    vst1.8      {d4}, [r0], r3          @ store row2
    vadd.u16    q11, q11, q12           @ h8 + h9
    vst1.8      {d6}, [r14], r3         @ store row6
    vrshrn.u16  d16, q8, #2             @ row3
    vrshrn.u16  d18, q9, #2             @ row7
    vrshrn.u16  d20, q10, #2            @ row4
    vrshrn.u16  d22, q11, #2            @ row8
    vst1.8      {d16}, [r0], r3         @ store row3
    vst1.8      {d18}, [r14], r3        @ store row7
    vst1.8      {d20}, [r0], r3         @ store row4
    vst1.8      {d22}, [r14], r3        @ store row8
    ldmfd       sp!, {pc}
@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_mc_fullx_fully_8x8_a9q()
@//
@// Detail Description : Motion compensation of an 8x8 block with a full-pel
@//                      motion vector in both directions: the 8x8 reference
@//                      block is copied unchanged into the current block.
@//
@// Inputs             : r0 - out     : Current Block Pointer
@//                      r1 - ref     : Reference Block Pointer
@//                      r2 - ref_wid : Reference row stride in bytes
@//                      r3 - out_wid : Current block row stride in bytes
@//
@// Registers Used     : r12, r14 (saved/restored), d0-d3
@//                      r0 and r1 are modified.
@//
@// Stack Usage        : 8 bytes
@//
@// Outputs            : The Motion Compensated Block
@//
@// Return Data        : None
@//
@// Programming Note   : Rows 1-4 go through r1/r0 and rows 5-8 through
@//                      r14/r12. The .rept body is expanded twice by the
@//                      assembler: the first copy moves rows 1, 5, 2, 6 and
@//                      the second moves rows 3, 7, 4, 8.
@//-----------------------------------------------------------------------------
@*/
    .global impeg2_mc_fullx_fully_8x8_a9q
impeg2_mc_fullx_fully_8x8_a9q:
    stmdb       sp!, {r12, lr}
    add         lr, r1, r2, lsl #2      @ lr  -> ref row 5
    add         r12, r0, r3, lsl #2     @ r12 -> out row 5

    .rept       2
    vld1.8      {d0}, [r1], r2          @ next row of the upper half (rows 1-4)
    vld1.8      {d1}, [lr], r2          @ next row of the lower half (rows 5-8)
    vld1.8      {d2}, [r1], r2          @ following upper-half row
    vld1.8      {d3}, [lr], r2          @ following lower-half row
    vst1.8      {d0}, [r0], r3          @ write the upper-half row
    vst1.8      {d1}, [r12], r3         @ write the lower-half row
    vst1.8      {d2}, [r0], r3          @ write the following upper-half row
    vst1.8      {d3}, [r12], r3         @ write the following lower-half row
    .endr

    ldmia       sp!, {r12, pc}          @ restore and return
@/*
@//---------------------------------------------------------------------------
@// Function Name      : impeg2_interpolate_a9q()
@//
@// Detail Description : Averages two macroblock-sized predictions into the
@//                      destination, byte by byte, with rounding:
@//                        dst = (src1 + src2 + 1) >> 1
@//                      16 rows x 16 bytes of luma are processed, then
@//                      8 rows x 8 bytes of each chroma plane (u, then v).
@//
@// Inputs             : r0 - pointer to the src1 descriptor; the words at
@//                           offsets 0, 4 and 8 hold the y, u and v plane
@//                           pointers
@//                      r1 - pointer to the src2 descriptor (same layout)
@//                      r2 - pointer to the dst descriptor (same layout)
@//                      r3 - dst luma row stride in bytes; chroma rows use
@//                           r3 >> 1
@//                      Both sources are read as tightly packed rows (the
@//                      loads post-increment by the 16 bytes just read), so
@//                      src rows are 16 bytes apart for luma and 8 bytes
@//                      apart for chroma.
@//
@// Registers Used     : r4, r5, r7, r12, r14 (saved/restored), d0-d7, d16-d23
@//                      Only caller-saved NEON registers are used; d8-d15
@//                      must be preserved by a callee under the AAPCS, so the
@//                      src2 rows are held in q8-q11.
@//                      r3 is halved in place; the condition flags are
@//                      modified by the loop counters.
@//
@// Stack Usage        : 20 bytes
@//
@// Outputs            : The averaged block in the dst planes
@//
@// Return Data        : None
@//
@// Programming Note   : The chroma loop runs twice (u, then v). The v plane
@//                      pointers are loaded at the bottom of every pass, so
@//                      they are fetched once more, unused, after the v pass.
@//-----------------------------------------------------------------------------
@*/
    .global impeg2_interpolate_a9q
impeg2_interpolate_a9q:
    stmfd       r13!, {r4, r5, r7, r12, r14}
    ldr         r4, [r0, #0]            @ptr_y src1
    ldr         r5, [r1, #0]            @ptr_y src2
    ldr         r7, [r2, #0]            @ptr_y dst buf
    mov         r12, #4                 @ 4 passes x 4 rows = 16 luma rows

interp_lumablocks_stride:
    vld1.8      {d0, d1}, [r4]!         @row1 src1
    vld1.8      {d2, d3}, [r4]!         @row2 src1
    vld1.8      {d4, d5}, [r4]!         @row3 src1
    vld1.8      {d6, d7}, [r4]!         @row4 src1
    vld1.8      {d16, d17}, [r5]!       @row1 src2
    vld1.8      {d18, d19}, [r5]!       @row2 src2
    vld1.8      {d20, d21}, [r5]!       @row3 src2
    vld1.8      {d22, d23}, [r5]!       @row4 src2
    vrhadd.u8   q0, q0, q8              @ rounded average, row1
    vrhadd.u8   q1, q1, q9              @ rounded average, row2
    vrhadd.u8   q2, q2, q10             @ rounded average, row3
    vrhadd.u8   q3, q3, q11             @ rounded average, row4
    vst1.8      {d0, d1}, [r7], r3      @row1
    vst1.8      {d2, d3}, [r7], r3      @row2
    vst1.8      {d4, d5}, [r7], r3      @row3
    vst1.8      {d6, d7}, [r7], r3      @row4
    subs        r12, r12, #1
    bne         interp_lumablocks_stride

    mov         r3, r3, lsr #1          @ chroma dst stride = luma stride >> 1
    ldr         r4, [r0, #4]            @ptr_u src1
    ldr         r5, [r1, #4]            @ptr_u src2
    ldr         r7, [r2, #4]            @ptr_u dst buf
    mov         r12, #2                 @ 2 passes: u plane, then v plane

    @chroma blocks: each 16-byte load carries two packed 8-byte rows
interp_chromablocks_stride:
    vld1.8      {d0, d1}, [r4]!         @row1 & 2 src1
    vld1.8      {d2, d3}, [r4]!         @row3 & 4 src1
    vld1.8      {d4, d5}, [r4]!         @row5 & 6 src1
    vld1.8      {d6, d7}, [r4]!         @row7 & 8 src1
    vld1.8      {d16, d17}, [r5]!       @row1 & 2 src2
    vld1.8      {d18, d19}, [r5]!       @row3 & 4 src2
    vld1.8      {d20, d21}, [r5]!       @row5 & 6 src2
    vld1.8      {d22, d23}, [r5]!       @row7 & 8 src2
    vrhadd.u8   q0, q0, q8              @ rounded average, row1 & 2
    vrhadd.u8   q1, q1, q9              @ rounded average, row3 & 4
    vrhadd.u8   q2, q2, q10             @ rounded average, row5 & 6
    vrhadd.u8   q3, q3, q11             @ rounded average, row7 & 8
    vst1.8      {d0}, [r7], r3          @row1
    vst1.8      {d1}, [r7], r3          @row2
    vst1.8      {d2}, [r7], r3          @row3
    vst1.8      {d3}, [r7], r3          @row4
    vst1.8      {d4}, [r7], r3          @row5
    vst1.8      {d5}, [r7], r3          @row6
    vst1.8      {d6}, [r7], r3          @row7
    vst1.8      {d7}, [r7], r3          @row8
    @ Set up the v plane for the second pass (reloaded, unused, after it).
    ldr         r4, [r0, #8]            @ptr_v src1
    ldr         r5, [r1, #8]            @ptr_v src2
    ldr         r7, [r2, #8]            @ptr_v dst buf
    subs        r12, r12, #1
    bne         interp_chromablocks_stride

    ldmfd       r13!, {r4, r5, r7, r12, pc}