| //****************************************************************************** |
| //* |
| //* Copyright (C) 2015 The Android Open Source Project |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //***************************************************************************** |
| //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| //*/ |
| |
| ///* |
| ////---------------------------------------------------------------------------- |
| //// File Name : impeg2_inter_pred.s |
| //// |
| //// Description : This file has motion compensation related |
| //// interpolation functions on Neon + CortexA-8 platform |
| //// |
| //// Reference Document : |
| //// |
| //// Revision History : |
| //// Date Author Detail Description |
| //// ------------ ---------------- ---------------------------------- |
| //// 18 jun 2010 S Hamsalekha Created |
| //// |
| ////------------------------------------------------------------------------- |
| //*/ |
| |
| ///* |
| //// ---------------------------------------------------------------------------- |
| //// Include Files |
| //// ---------------------------------------------------------------------------- |
| //*/ |
| // PRESERVE8 |
| .text |
| .include "impeg2_neon_macros.s" |
| |
| ///* |
| //// ---------------------------------------------------------------------------- |
| //// Struct/Union Types and Define |
| //// ---------------------------------------------------------------------------- |
| //*/ |
| |
| |
| ///* |
| //// ---------------------------------------------------------------------------- |
| //// Static Global Data section variables |
| //// ---------------------------------------------------------------------------- |
| //*/ |
| //// -------------------------- NONE -------------------------------------------- |
| |
| |
| ///* |
| //// ---------------------------------------------------------------------------- |
| //// Static Prototype Functions |
| //// ---------------------------------------------------------------------------- |
| //*/ |
| //// -------------------------- NONE -------------------------------------------- |
| |
| ///* |
| //// ---------------------------------------------------------------------------- |
| //// Exported functions |
| //// ---------------------------------------------------------------------------- |
| //*/ |
| |
| |
//-----------------------------------------------------------------------------
// Function Name      : impeg2_copy_mb_av8()
//
// Detail Description : Copies one macroblock from src to dst:
//                        Y plane : 16 rows of 16 bytes
//                        U plane :  8 rows of  8 bytes
//                        V plane :  8 rows of  8 bytes
//
// Inputs             : x0 - src : address of three consecutive 64-bit plane
//                                 pointers, y at +0, u at +8, v at +16
//                      x1 - dst : same layout as src
//                      x2 - source row stride in bytes for Y
//                           (U and V use x2 >> 1)
//                      x3 - destination row stride in bytes for Y
//                           (U and V use x3 >> 1)
//
// Registers Used     : x2-x5, v0, v1, plus whatever push_v_regs/pop_v_regs
//                      touch
//
// Stack Usage        : whatever push_v_regs reserves; that macro lives in
//                      impeg2_neon_macros.s and is not visible from this file
//
// Return Data        : None
//
// Programming Note   : x2 and x3 are consumed as full 64-bit post-index
//                      increments.
//                      NOTE(review): if the C prototype declares the widths
//                      as 32-bit integers, the upper 32 bits of x2/x3 are not
//                      guaranteed to be clean on entry - confirm with caller.
//-----------------------------------------------------------------------------

    .global impeg2_copy_mb_av8

impeg2_copy_mb_av8:
    push_v_regs

    // ---- Y plane: 16 rows, 16 bytes per row --------------------------------
    ldr             x4, [x0]                    // x4 = src->y
    ldr             x5, [x1]                    // x5 = dst->y
    .rept 16
    ld1             {v0.8b, v1.8b}, [x4], x2    // fetch one row, step src
    st1             {v0.8b, v1.8b}, [x5], x3    // emit one row, step dst
    .endr

    // Chroma planes are walked with half the luma stride.
    lsr             x2, x2, #1
    lsr             x3, x3, #1

    // ---- U plane: 8 rows, 8 bytes per row ----------------------------------
    ldr             x4, [x0, #8]                // x4 = src->u
    ldr             x5, [x1, #8]                // x5 = dst->u
    .rept 8
    ld1             {v0.8b}, [x4], x2
    st1             {v0.8b}, [x5], x3
    .endr

    // ---- V plane: 8 rows, 8 bytes per row ----------------------------------
    ldr             x4, [x0, #16]               // x4 = src->v
    ldr             x5, [x1, #16]               // x5 = dst->v
    .rept 8
    ld1             {v0.8b}, [x4], x2
    st1             {v0.8b}, [x5], x3
    .endr

    pop_v_regs
    ret
| |
| |
//-----------------------------------------------------------------------------
// Function Name      : impeg2_mc_fullx_halfy_8x8_av8()
//
// Detail Description : Vertical half-pel prediction for an 8x8 block.
//                      Reads 9 reference rows of 8 bytes and writes 8 rows:
//                        out[r][c] = (ref[r][c] + ref[r+1][c] + 1) >> 1
//                      (URHADD is an unsigned rounding halving add.)
//
// Inputs             : x0 - out     : destination block pointer
//                      x1 - ref     : reference block pointer
//                      x2 - ref_wid : reference row stride in bytes
//                      x3 - out_wid : destination row stride in bytes
//
// Registers Used     : x0-x3, x14, v0-v9
//
// Stack Usage        : whatever push_v_regs reserves; that macro lives in
//                      impeg2_neon_macros.s and is not visible from this file
//
// Outputs            : 8x8 predicted block at out
//
// Return Data        : None
//
// Programming Note   : Every row is loaded and stored as 8 bytes, so all
//                      averaging is done on 64-bit (.8b) vectors.
//                      x2 and x3 are used as full 64-bit increments.
//                      NOTE(review): if the C prototype passes the widths as
//                      32-bit integers, their upper halves are not guaranteed
//                      clean on entry - confirm with caller.
//-----------------------------------------------------------------------------

    .global impeg2_mc_fullx_halfy_8x8_av8

impeg2_mc_fullx_halfy_8x8_av8:
    push_v_regs

    // Two read cursors, each advancing two rows at a time:
    //   x1  walks reference rows 1, 3, 5, 7, 9
    //   x14 walks reference rows 2, 4, 6, 8
    add             x14, x1, x2
    lsl             x2, x2, #1

    // Loads are interleaved with the averages that consume them. Each
    // average is issued before either of its inputs is overwritten.
    ld1             {v0.8b}, [x1], x2           // v0 = ref row 1
    ld1             {v2.8b}, [x14], x2          // v2 = ref row 2
    ld1             {v4.8b}, [x1], x2           // v4 = ref row 3
    ld1             {v6.8b}, [x14], x2          // v6 = ref row 4
    ld1             {v1.8b}, [x1], x2           // v1 = ref row 5
    ld1             {v3.8b}, [x14], x2          // v3 = ref row 6
    urhadd          v9.8b, v1.8b, v6.8b         // v9 = out row 4 (rows 4,5)
    ld1             {v5.8b}, [x1], x2           // v5 = ref row 7
    urhadd          v0.8b, v0.8b, v2.8b         // v0 = out row 1 (rows 1,2)
    urhadd          v1.8b, v1.8b, v3.8b         // v1 = out row 5 (rows 5,6)
    ld1             {v7.8b}, [x14], x2          // v7 = ref row 8
    urhadd          v2.8b, v2.8b, v4.8b         // v2 = out row 2 (rows 2,3)
    urhadd          v3.8b, v3.8b, v5.8b         // v3 = out row 6 (rows 6,7)
    ld1             {v8.8b}, [x1], x2           // v8 = ref row 9
    urhadd          v4.8b, v4.8b, v6.8b         // v4 = out row 3 (rows 3,4)
    urhadd          v5.8b, v5.8b, v7.8b         // v5 = out row 7 (rows 7,8)

    // Two write cursors, again two rows per step:
    //   x0  -> output rows 1, 3, 5, 7
    //   x14 -> output rows 2, 4, 6, 8
    add             x14, x0, x3
    lsl             x3, x3, #1

    st1             {v2.8b}, [x14], x3          // out row 2
    urhadd          v7.8b, v7.8b, v8.8b         // v7 = out row 8 (rows 8,9)
    st1             {v0.8b}, [x0], x3           // out row 1
    st1             {v9.8b}, [x14], x3          // out row 4
    st1             {v4.8b}, [x0], x3           // out row 3
    st1             {v3.8b}, [x14], x3          // out row 6
    st1             {v1.8b}, [x0], x3           // out row 5
    st1             {v7.8b}, [x14], x3          // out row 8
    st1             {v5.8b}, [x0], x3           // out row 7

    pop_v_regs
    ret
| |
| |
| |
| |
| |
//-----------------------------------------------------------------------------
// Function Name      : impeg2_mc_halfx_fully_8x8_av8()
//
// Detail Description : Horizontal half-pel prediction for an 8x8 block.
//                      For each of 8 reference rows:
//                        out[r][c] = (ref[r][c] + ref[r][c+1] + 1) >> 1
//                      (URHADD is an unsigned rounding halving add.)
//
// Inputs             : x0 - out     : destination block pointer
//                      x1 - ref     : reference block pointer
//                      x2 - ref_wid : reference row stride in bytes
//                      x3 - out_wid : destination row stride in bytes
//
// Registers Used     : x0-x3, x12, x14,
//                      v0-v10, v12-v14, v16-v18, v20-v22
//
// Stack Usage        : whatever push_v_regs reserves; that macro lives in
//                      impeg2_neon_macros.s and is not visible from this file
//
// Outputs            : 8x8 predicted block at out
//
// Return Data        : None
//
// Programming Note   : Each reference row is fetched as 16 bytes (two .8b
//                      registers) although only the first 9 are used.
//                      The shifted and unshifted rows are both 8-byte
//                      vectors, so averaging is done on .8b.
//                      x2 and x3 are used as full 64-bit increments.
//                      NOTE(review): if the C prototype passes the widths as
//                      32-bit integers, their upper halves are not guaranteed
//                      clean on entry - confirm with caller.
//-----------------------------------------------------------------------------

    .global impeg2_mc_halfx_fully_8x8_av8

impeg2_mc_halfx_fully_8x8_av8:
    push_v_regs

    // Rows 1-4 are handled through x1/x0, rows 5-8 through x14/x12.
    add             x14, x1, x2, lsl #2         // x14 -> ref row 5
    add             x12, x0, x3, lsl #2         // x12 -> out row 5

    ld1             {v0.8b, v1.8b}, [x1], x2    // ref row 1, bytes 0-15
    ld1             {v2.8b, v3.8b}, [x14], x2   // ref row 5
    ld1             {v4.8b, v5.8b}, [x1], x2    // ref row 2
    ld1             {v6.8b, v7.8b}, [x14], x2   // ref row 6

    // ext #1 yields bytes 1-8 of the row, i.e. the right-hand neighbours.
    ext             v8.8b, v0.8b, v1.8b, #1     // row 1 shifted
    ext             v12.8b, v2.8b, v3.8b, #1    // row 5 shifted
    ext             v16.8b, v4.8b, v5.8b, #1    // row 2 shifted
    ext             v20.8b, v6.8b, v7.8b, #1    // row 6 shifted

    ld1             {v9.8b, v10.8b}, [x1], x2   // ref row 3
    ld1             {v13.8b, v14.8b}, [x14], x2 // ref row 7
    ld1             {v17.8b, v18.8b}, [x1], x2  // ref row 4
    ld1             {v21.8b, v22.8b}, [x14], x2 // ref row 8

    // v1/v3/v5/v7 held the unused upper halves of rows 1/5/2/6 and are
    // recycled here for the shifted copies of rows 3/7/4/8.
    ext             v1.8b, v9.8b, v10.8b, #1    // row 3 shifted
    ext             v3.8b, v13.8b, v14.8b, #1   // row 7 shifted
    ext             v5.8b, v17.8b, v18.8b, #1   // row 4 shifted
    ext             v7.8b, v21.8b, v22.8b, #1   // row 8 shifted

    urhadd          v0.8b, v0.8b, v8.8b         // v0 = out row 1
    urhadd          v1.8b, v1.8b, v9.8b         // v1 = out row 3
    urhadd          v2.8b, v2.8b, v12.8b        // v2 = out row 5
    urhadd          v3.8b, v3.8b, v13.8b        // v3 = out row 7
    urhadd          v4.8b, v4.8b, v16.8b        // v4 = out row 2
    urhadd          v5.8b, v5.8b, v17.8b        // v5 = out row 4
    urhadd          v6.8b, v6.8b, v20.8b        // v6 = out row 6
    urhadd          v7.8b, v7.8b, v21.8b        // v7 = out row 8

    st1             {v0.8b}, [x0], x3           // out row 1
    st1             {v2.8b}, [x12], x3          // out row 5
    st1             {v4.8b}, [x0], x3           // out row 2
    st1             {v6.8b}, [x12], x3          // out row 6
    st1             {v1.8b}, [x0], x3           // out row 3
    st1             {v3.8b}, [x12], x3          // out row 7
    st1             {v5.8b}, [x0], x3           // out row 4
    st1             {v7.8b}, [x12], x3          // out row 8

    pop_v_regs
    ret
| |
| |
| |
| |
| |
| |
| |
//-----------------------------------------------------------------------------
// Function Name      : impeg2_mc_halfx_halfy_8x8_av8()
//
// Detail Description : Diagonal (horizontal + vertical) half-pel prediction
//                      for an 8x8 block. Reads 9 reference rows and writes
//                      8 rows:
//                        out[r][c] = (ref[r][c]   + ref[r][c+1] +
//                                     ref[r+1][c] + ref[r+1][c+1] + 2) >> 2
//                      Horizontal pair sums are widened to 16 bits (UADDL),
//                      adjacent rows are added, and RSHRN #2 performs the
//                      rounded divide by four while narrowing back to bytes.
//
// Inputs             : x0 - out     : destination block pointer
//                      x1 - ref     : reference block pointer
//                      x2 - ref_wid : reference row stride in bytes
//                      x3 - out_wid : destination row stride in bytes
//
// Registers Used     : x0-x3, x14,
//                      v0-v18, v20, v22, v24, v26, v28, v30
//
// Stack Usage        : whatever push_v_regs reserves; that macro lives in
//                      impeg2_neon_macros.s and is not visible from this file
//
// Outputs            : 8x8 predicted block at out
//
// Return Data        : None
//
// Programming Note   : Each reference row is fetched as 16 bytes although
//                      only the first 9 are used.
//                      x2 and x3 are used as full 64-bit increments.
//                      NOTE(review): if the C prototype passes the widths as
//                      32-bit integers, their upper halves are not guaranteed
//                      clean on entry - confirm with caller.
//-----------------------------------------------------------------------------

    .global impeg2_mc_halfx_halfy_8x8_av8

impeg2_mc_halfx_halfy_8x8_av8:
    push_v_regs

    // x1 reads reference rows 1-4; x14 reads reference rows 5-9.
    add             x14, x1, x2, lsl #2

    ld1             {v0.8b, v1.8b}, [x1], x2    // ref row 1, bytes 0-15
    ld1             {v2.8b, v3.8b}, [x14], x2   // ref row 5
    ld1             {v4.8b, v5.8b}, [x1], x2    // ref row 2
    ld1             {v6.8b, v7.8b}, [x14], x2   // ref row 6

    // Replace the upper half of each pair with bytes 1-8 of the row,
    // i.e. the right-hand neighbour of every pixel in the lower half.
    ext             v1.8b, v0.8b, v1.8b, #1     // row 1 shifted
    ext             v3.8b, v2.8b, v3.8b, #1     // row 5 shifted
    ext             v5.8b, v4.8b, v5.8b, #1     // row 2 shifted
    ext             v7.8b, v6.8b, v7.8b, #1     // row 6 shifted

    ld1             {v8.8b, v9.8b}, [x1], x2    // ref row 3
    ld1             {v10.8b, v11.8b}, [x14], x2 // ref row 7
    ld1             {v12.8b, v13.8b}, [x1], x2  // ref row 4
    ld1             {v14.8b, v15.8b}, [x14], x2 // ref row 8
    ext             v9.8b, v8.8b, v9.8b, #1     // row 3 shifted
    ld1             {v16.8b, v17.8b}, [x14], x2 // ref row 9

    ext             v11.8b, v10.8b, v11.8b, #1  // row 7 shifted
    ext             v13.8b, v12.8b, v13.8b, #1  // row 4 shifted
    ext             v15.8b, v14.8b, v15.8b, #1  // row 8 shifted
    ext             v17.8b, v16.8b, v17.8b, #1  // row 9 shifted

    // ---- horizontal pass: 16-bit sum of each pixel and its neighbour -------
    uaddl           v0.8h, v0.8b, v1.8b         // H(row 1)
    uaddl           v2.8h, v2.8b, v3.8b         // H(row 5)
    uaddl           v4.8h, v4.8b, v5.8b         // H(row 2)
    uaddl           v6.8h, v6.8b, v7.8b         // H(row 6)
    uaddl           v8.8h, v8.8b, v9.8b         // H(row 3)
    uaddl           v10.8h, v10.8b, v11.8b      // H(row 7)
    uaddl           v12.8h, v12.8b, v13.8b      // H(row 4)
    uaddl           v14.8h, v14.8b, v15.8b      // H(row 8)
    uaddl           v16.8h, v16.8b, v17.8b      // H(row 9)

    // ---- vertical pass: add adjacent H rows, round, narrow, store ----------
    // x0 writes output rows 1-4; x14 is re-pointed at output row 5.
    add             x14, x0, x3, lsl #2

    add             v18.8h, v0.8h, v4.8h        // H1 + H2
    add             v26.8h, v2.8h, v6.8h        // H5 + H6
    add             v20.8h, v4.8h, v8.8h        // H2 + H3
    add             v28.8h, v6.8h, v10.8h       // H6 + H7

    rshrn           v18.8b, v18.8h, #2          // out row 1
    rshrn           v26.8b, v26.8h, #2          // out row 5
    rshrn           v20.8b, v20.8h, #2          // out row 2
    rshrn           v28.8b, v28.8h, #2          // out row 6

    add             v22.8h, v8.8h, v12.8h       // H3 + H4
    st1             {v18.8b}, [x0], x3          // out row 1
    add             v30.8h, v10.8h, v14.8h      // H7 + H8
    st1             {v26.8b}, [x14], x3         // out row 5
    add             v24.8h, v12.8h, v2.8h       // H4 + H5
    st1             {v20.8b}, [x0], x3          // out row 2
    add             v14.8h, v14.8h, v16.8h      // H8 + H9
    st1             {v28.8b}, [x14], x3         // out row 6

    rshrn           v22.8b, v22.8h, #2          // out row 3
    rshrn           v30.8b, v30.8h, #2          // out row 7
    rshrn           v24.8b, v24.8h, #2          // out row 4
    rshrn           v14.8b, v14.8h, #2          // out row 8

    st1             {v22.8b}, [x0], x3          // out row 3
    st1             {v30.8b}, [x14], x3         // out row 7
    st1             {v24.8b}, [x0], x3          // out row 4
    st1             {v14.8b}, [x14], x3         // out row 8

    pop_v_regs
    ret
| |
| |
| |
| |
//-----------------------------------------------------------------------------
// Function Name      : impeg2_mc_fullx_fully_8x8_av8()
//
// Detail Description : Full-pel prediction for an 8x8 block: copies 8 rows
//                      of 8 bytes from ref to out with no interpolation.
//
// Inputs             : x0 - out     : destination block pointer
//                      x1 - ref     : reference block pointer
//                      x2 - ref_wid : reference row stride in bytes
//                      x3 - out_wid : destination row stride in bytes
//
// Registers Used     : x0-x3, x12, x14, v0-v3
//
// Stack Usage        : whatever push_v_regs reserves; that macro lives in
//                      impeg2_neon_macros.s and is not visible from this file
//
// Outputs            : 8x8 block at out
//
// Return Data        : None
//
// Programming Note   : x2 and x3 are used as full 64-bit increments.
//                      NOTE(review): if the C prototype passes the widths as
//                      32-bit integers, their upper halves are not guaranteed
//                      clean on entry - confirm with caller.
//-----------------------------------------------------------------------------

    .global impeg2_mc_fullx_fully_8x8_av8

impeg2_mc_fullx_fully_8x8_av8:
    push_v_regs

    // The block is split into a top half (rows 1-4, via x1 -> x0) and a
    // bottom half (rows 5-8, via x14 -> x12), copied in lock-step.
    add             x14, x1, x2, lsl #2         // x14 -> ref row 5
    add             x12, x0, x3, lsl #2         // x12 -> out row 5

    // Pass 1 moves rows 1,2 and 5,6; pass 2 moves rows 3,4 and 7,8.
    .rept 2
    ld1             {v0.8b}, [x1], x2           // top half, first row of pass
    ld1             {v1.8b}, [x14], x2          // bottom half, first row
    ld1             {v2.8b}, [x1], x2           // top half, second row
    ld1             {v3.8b}, [x14], x2          // bottom half, second row

    st1             {v0.8b}, [x0], x3
    st1             {v1.8b}, [x12], x3
    st1             {v2.8b}, [x0], x3
    st1             {v3.8b}, [x12], x3
    .endr

    pop_v_regs
    ret
| |
| |
| |
| |
//-----------------------------------------------------------------------------
// Function Name      : impeg2_interpolate_av8()
//
// Detail Description : Averages two macroblock-sized buffers with rounding
//                      and writes the result to a strided destination:
//                        dst = (src1 + src2 + 1) >> 1     (URHADD)
//                      Y : 16 rows of 16 bytes
//                      U : 8 rows of 8 bytes
//                      V : 8 rows of 8 bytes
//
// Inputs             : x0 - src1 : address of three consecutive 64-bit plane
//                                  pointers, y at +0, u at +8, v at +16
//                      x1 - src2 : same layout as src1
//                      x2 - dst  : same layout as src1
//                      x3 - destination row stride in bytes for Y
//                           (U and V use x3 >> 1)
//
// Registers Used     : x3-x5, x7, x12, v0-v15
//
// Stack Usage        : whatever push_v_regs reserves; that macro lives in
//                      impeg2_neon_macros.s and is not visible from this file
//
// Outputs            : averaged macroblock in the dst planes
//
// Return Data        : None
//
// Programming Note   : Both sources are read with a fixed post-increment of
//                      16 bytes per load, so their rows must be contiguous
//                      (row pitch 16 for Y, 8 for U/V). Only dst is strided.
//                      Chroma rows are 8 bytes wide and are averaged on .8b
//                      vectors; luma rows are 16 bytes and use .16b.
//                      x3 is used as a full 64-bit increment.
//                      NOTE(review): if the C prototype passes the stride as
//                      a 32-bit integer, its upper half is not guaranteed
//                      clean on entry - confirm with caller.
//-----------------------------------------------------------------------------

    .global impeg2_interpolate_av8

impeg2_interpolate_av8:
    push_v_regs

    // ---- luma: 4 passes x 4 rows x 16 bytes --------------------------------
    ldr             x4, [x0, #0]                // x4 = src1->y
    ldr             x5, [x1, #0]                // x5 = src2->y
    ldr             x7, [x2, #0]                // x7 = dst->y
    mov             x12, #4                     // passes remaining

interp_lumablocks_stride:
    ld1             {v0.16b}, [x4], #16         // src1 row n
    ld1             {v2.16b}, [x4], #16         // src1 row n+1
    ld1             {v4.16b}, [x4], #16         // src1 row n+2
    ld1             {v6.16b}, [x4], #16         // src1 row n+3

    ld1             {v8.16b}, [x5], #16         // src2 row n
    ld1             {v10.16b}, [x5], #16        // src2 row n+1
    ld1             {v12.16b}, [x5], #16        // src2 row n+2
    ld1             {v14.16b}, [x5], #16        // src2 row n+3

    urhadd          v0.16b, v0.16b, v8.16b
    urhadd          v2.16b, v2.16b, v10.16b
    urhadd          v4.16b, v4.16b, v12.16b
    urhadd          v6.16b, v6.16b, v14.16b

    st1             {v0.16b}, [x7], x3          // dst row n
    st1             {v2.16b}, [x7], x3          // dst row n+1
    st1             {v4.16b}, [x7], x3          // dst row n+2
    st1             {v6.16b}, [x7], x3          // dst row n+3

    subs            x12, x12, #1
    bne             interp_lumablocks_stride

    // ---- chroma: 2 passes (U then V), 8 rows x 8 bytes each ----------------
    lsr             x3, x3, #1                  // chroma stride = luma / 2

    ldr             x4, [x0, #8]                // x4 = src1->u
    ldr             x5, [x1, #8]                // x5 = src2->u
    ldr             x7, [x2, #8]                // x7 = dst->u
    mov             x12, #2                     // planes remaining

interp_chromablocks_stride:
    // Each two-register load pulls two consecutive 8-byte rows.
    ld1             {v0.8b, v1.8b}, [x4], #16   // src1 rows 1,2
    ld1             {v2.8b, v3.8b}, [x4], #16   // src1 rows 3,4
    ld1             {v4.8b, v5.8b}, [x4], #16   // src1 rows 5,6
    ld1             {v6.8b, v7.8b}, [x4], #16   // src1 rows 7,8

    ld1             {v8.8b, v9.8b}, [x5], #16   // src2 rows 1,2
    ld1             {v10.8b, v11.8b}, [x5], #16 // src2 rows 3,4
    ld1             {v12.8b, v13.8b}, [x5], #16 // src2 rows 5,6
    ld1             {v14.8b, v15.8b}, [x5], #16 // src2 rows 7,8

    urhadd          v0.8b, v0.8b, v8.8b         // row 1
    urhadd          v1.8b, v1.8b, v9.8b         // row 2
    urhadd          v2.8b, v2.8b, v10.8b        // row 3
    urhadd          v3.8b, v3.8b, v11.8b        // row 4
    urhadd          v4.8b, v4.8b, v12.8b        // row 5
    urhadd          v5.8b, v5.8b, v13.8b        // row 6
    urhadd          v6.8b, v6.8b, v14.8b        // row 7
    urhadd          v7.8b, v7.8b, v15.8b        // row 8

    st1             {v0.8b}, [x7], x3           // dst row 1
    st1             {v1.8b}, [x7], x3           // dst row 2
    st1             {v2.8b}, [x7], x3           // dst row 3
    st1             {v3.8b}, [x7], x3           // dst row 4
    st1             {v4.8b}, [x7], x3           // dst row 5
    st1             {v5.8b}, [x7], x3           // dst row 6
    st1             {v6.8b}, [x7], x3           // dst row 7
    st1             {v7.8b}, [x7], x3           // dst row 8

    // Point at the V planes for the second pass. These loads also run
    // after the final pass, where the values are simply unused.
    ldr             x4, [x0, #16]               // x4 = src1->v
    ldr             x5, [x1, #16]               // x5 = src2->v
    ldr             x7, [x2, #16]               // x7 = dst->v

    subs            x12, x12, #1
    bne             interp_chromablocks_stride

    pop_v_regs
    ret
| |
| |
| |
| |