// blob: 98ade457cd1e6d02ec6a8a681d60804d040b16cb [file] [log] [blame]
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///*
////----------------------------------------------------------------------------
//// File Name : impeg2_inter_pred.s
////
//// Description : This file has motion compensation related
//// interpolation functions for Neon (Advanced SIMD) on the ARMv8 / AArch64 platform
////
//// Reference Document :
////
//// Revision History :
//// Date Author Detail Description
//// ------------ ---------------- ----------------------------------
//// 18 jun 2010 S Hamsalekha Created
////
////-------------------------------------------------------------------------
//*/
///*
//// ----------------------------------------------------------------------------
//// Include Files
//// ----------------------------------------------------------------------------
//*/
// PRESERVE8
.text
.include "impeg2_neon_macros.s"
///*
//// ----------------------------------------------------------------------------
//// Struct/Union Types and Define
//// ----------------------------------------------------------------------------
//*/
///*
//// ----------------------------------------------------------------------------
//// Static Global Data section variables
//// ----------------------------------------------------------------------------
//*/
//// -------------------------- NONE --------------------------------------------
///*
//// ----------------------------------------------------------------------------
//// Static Prototype Functions
//// ----------------------------------------------------------------------------
//*/
//// -------------------------- NONE --------------------------------------------
///*
//// ----------------------------------------------------------------------------
//// Exported functions
//// ----------------------------------------------------------------------------
//*/
///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_copy_mb_av8()
////
//// Detail Description : Copies one macroblock from the source planes to the
////                      destination planes: 16 rows of 16 bytes for y, then
////                      8 rows of 8 bytes each for u and v.
////
//// Inputs             : x0 - pointer to the source descriptor; the plane
////                           pointers are read from byte offsets 0 (y),
////                           8 (u) and 16 (v)
////                      x1 - pointer to the destination descriptor, read
////                           with the same three offsets
////                      x2 - source row stride in bytes for y
////                           (u and v use x2 >> 1)
////                      x3 - destination row stride in bytes for y
////                           (u and v use x3 >> 1)
////
//// Registers Used     : x4, x5, v0, v1 (x2 and x3 are halved in place)
////
//// Stack Usage        : none in this body; push_v_regs / pop_v_regs come
////                      from impeg2_neon_macros.s (the original header
////                      quotes 64 bytes)
////
//// Outputs            : The copied macroblock in the destination planes
////
//// Return Data        : None
////
//// Programming Note   : x2 and x3 are used as full 64-bit post-index
////                      offsets. NOTE(review): confirm the caller passes
////                      them extended to 64 bits.
////-----------------------------------------------------------------------------
//*/
.global impeg2_copy_mb_av8
impeg2_copy_mb_av8:
    push_v_regs

    // ---- y plane: 16 rows x 16 bytes ----
    ldr     x4, [x0]                        // x4 = src->y
    ldr     x5, [x1]                        // x5 = dst->y
    .rept   16
    ld1     {v0.8b, v1.8b}, [x4], x2        // fetch one row, step to next src row
    st1     {v0.8b, v1.8b}, [x5], x3        // write it, step to next dst row
    .endr

    // Both chroma planes use half the luma stride.
    lsr     x2, x2, #1
    lsr     x3, x3, #1

    // ---- u plane: 8 rows x 8 bytes ----
    ldr     x4, [x0, #8]                    // x4 = src->u
    ldr     x5, [x1, #8]                    // x5 = dst->u
    .rept   8
    ld1     {v0.8b}, [x4], x2
    st1     {v0.8b}, [x5], x3
    .endr

    // ---- v plane: 8 rows x 8 bytes ----
    ldr     x4, [x0, #16]                   // x4 = src->v
    ldr     x5, [x1, #16]                   // x5 = dst->v
    .rept   8
    ld1     {v0.8b}, [x4], x2
    st1     {v0.8b}, [x5], x3
    .endr

    pop_v_regs
    ret
///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_fullx_halfy_8x8_av8()
////
//// Detail Description : Motion compensation of an 8x8 block with full-pel
////                      horizontal and half-pel vertical displacement.
////                      Nine reference rows are read and every output row
////                      is the rounded average of two vertically adjacent
////                      reference rows:
////                          out[r] = (ref[r] + ref[r + 1] + 1) >> 1
////                      (Per the original header it is called for blocks
////                      that are not coded.)
////
//// Inputs             : x0 - out     : current block pointer
////                      x1 - ref     : reference block pointer
////                      x2 - ref_wid : reference row stride in bytes
////                      x3 - out_wid : output row stride in bytes
////
//// Registers Used     : x14, v0-v9 (x0-x3 are modified)
////
//// Stack Usage        : none in this body; push_v_regs / pop_v_regs come
////                      from impeg2_neon_macros.s (the original header
////                      quotes 64 bytes)
////
//// Outputs            : The motion compensated block
////
//// Return Data        : None
////
//// Programming Note   : x2 and x3 are used as full 64-bit post-index
////                      offsets. NOTE(review): confirm the caller passes
////                      them extended to 64 bits.
////-----------------------------------------------------------------------------
//*/
.global impeg2_mc_fullx_halfy_8x8_av8
impeg2_mc_fullx_halfy_8x8_av8:
    push_v_regs

    // Two read pointers, each stepping two rows at a time:
    // x1 covers reference rows 1,3,5,7,9 and x14 covers rows 2,4,6,8.
    add     x14, x1, x2
    lsl     x2, x2, #1

    // Load the 8 + 1 reference rows; averaging is interleaved with the
    // loads. Only eight pixels per row are live, so the 64-bit (.8b)
    // arrangement is used throughout.
    ld1     {v0.8b}, [x1], x2               // ref row 1
    ld1     {v2.8b}, [x14], x2              // ref row 2
    ld1     {v4.8b}, [x1], x2               // ref row 3
    ld1     {v6.8b}, [x14], x2              // ref row 4
    ld1     {v1.8b}, [x1], x2               // ref row 5
    ld1     {v3.8b}, [x14], x2              // ref row 6
    urhadd  v9.8b, v1.8b, v6.8b             // out row 4 = avg(ref 4, ref 5); must precede the v1 update
    ld1     {v5.8b}, [x1], x2               // ref row 7
    urhadd  v0.8b, v0.8b, v2.8b             // out row 1 = avg(ref 1, ref 2)
    urhadd  v1.8b, v1.8b, v3.8b             // out row 5 = avg(ref 5, ref 6)
    ld1     {v7.8b}, [x14], x2              // ref row 8
    urhadd  v2.8b, v2.8b, v4.8b             // out row 2 = avg(ref 2, ref 3)
    urhadd  v3.8b, v3.8b, v5.8b             // out row 6 = avg(ref 6, ref 7)
    ld1     {v8.8b}, [x1], x2               // ref row 9
    urhadd  v4.8b, v4.8b, v6.8b             // out row 3 = avg(ref 3, ref 4)
    urhadd  v5.8b, v5.8b, v7.8b             // out row 7 = avg(ref 7, ref 8)

    // Two write pointers, again stepping two rows at a time:
    // x0 covers output rows 1,3,5,7 and x14 covers rows 2,4,6,8.
    add     x14, x0, x3
    lsl     x3, x3, #1

    // Store the eight output rows.
    st1     {v2.8b}, [x14], x3              // out row 2
    urhadd  v7.8b, v7.8b, v8.8b             // out row 8 = avg(ref 8, ref 9)
    st1     {v0.8b}, [x0], x3               // out row 1
    st1     {v9.8b}, [x14], x3              // out row 4
    st1     {v4.8b}, [x0], x3               // out row 3
    st1     {v3.8b}, [x14], x3              // out row 6
    st1     {v1.8b}, [x0], x3               // out row 5
    st1     {v7.8b}, [x14], x3              // out row 8
    st1     {v5.8b}, [x0], x3               // out row 7

    pop_v_regs
    ret
///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_halfx_fully_8x8_av8()
////
//// Detail Description : Motion compensation of an 8x8 block with half-pel
////                      horizontal and full-pel vertical displacement.
////                      Every output pixel is the rounded average of a
////                      reference pixel and its right-hand neighbour:
////                          out[r][c] = (ref[r][c] + ref[r][c + 1] + 1) >> 1
////                      (Per the original header it is called for blocks
////                      that are not coded.)
////
//// Inputs             : x0 - out     : current block pointer
////                      x1 - ref     : reference block pointer
////                      x2 - ref_wid : reference row stride in bytes
////                      x3 - out_wid : output row stride in bytes
////
//// Registers Used     : x12, x14, v0-v10, v12-v14, v16-v18, v20-v22
////                      (x0 and x1 are advanced)
////
//// Stack Usage        : none in this body; push_v_regs / pop_v_regs come
////                      from impeg2_neon_macros.s (the original header
////                      quotes 64 bytes)
////
//// Outputs            : The motion compensated block
////
//// Return Data        : None
////
//// Programming Note   : Each row load fetches 16 bytes although only the
////                      first 9 contribute to the result.
////                      x2 and x3 are used as full 64-bit post-index
////                      offsets. NOTE(review): confirm the caller passes
////                      them extended to 64 bits.
////-----------------------------------------------------------------------------
//*/
.global impeg2_mc_halfx_fully_8x8_av8
impeg2_mc_halfx_fully_8x8_av8:
    push_v_regs

    // Rows 1-4 go through x1 / x0, rows 5-8 through x14 / x12.
    add     x14, x1, x2, lsl #2             // x14 -> ref row 5
    add     x12, x0, x3, lsl #2             // x12 -> out row 5

    ld1     {v0.8b, v1.8b}, [x1], x2        // ref row 1, 16 pixels
    ld1     {v2.8b, v3.8b}, [x14], x2       // ref row 5
    ld1     {v4.8b, v5.8b}, [x1], x2        // ref row 2
    ld1     {v6.8b, v7.8b}, [x14], x2       // ref row 6

    // ext #1 yields pixels 1..8 of a row, i.e. the right-hand neighbours.
    ext     v8.8b, v0.8b, v1.8b, #1         // row 1 neighbours
    ext     v12.8b, v2.8b, v3.8b, #1        // row 5 neighbours
    ext     v16.8b, v4.8b, v5.8b, #1        // row 2 neighbours
    ext     v20.8b, v6.8b, v7.8b, #1        // row 6 neighbours

    ld1     {v9.8b, v10.8b}, [x1], x2       // ref row 3
    ld1     {v13.8b, v14.8b}, [x14], x2     // ref row 7
    ld1     {v17.8b, v18.8b}, [x1], x2      // ref row 4
    ld1     {v21.8b, v22.8b}, [x14], x2     // ref row 8

    // For rows 3, 7, 4 and 8 the neighbours land in v1, v3, v5, v7,
    // whose previous contents were already consumed by the ext above.
    ext     v1.8b, v9.8b, v10.8b, #1        // row 3 neighbours
    ext     v3.8b, v13.8b, v14.8b, #1       // row 7 neighbours
    ext     v5.8b, v17.8b, v18.8b, #1       // row 4 neighbours
    ext     v7.8b, v21.8b, v22.8b, #1       // row 8 neighbours

    // Rounded average of each pixel with its neighbour. Only eight pixels
    // per row are live, so the 64-bit (.8b) arrangement is sufficient.
    urhadd  v0.8b, v0.8b, v8.8b             // out row 1
    urhadd  v1.8b, v1.8b, v9.8b             // out row 3
    urhadd  v2.8b, v2.8b, v12.8b            // out row 5
    urhadd  v3.8b, v3.8b, v13.8b            // out row 7
    urhadd  v4.8b, v4.8b, v16.8b            // out row 2
    urhadd  v5.8b, v5.8b, v17.8b            // out row 4
    urhadd  v6.8b, v6.8b, v20.8b            // out row 6
    urhadd  v7.8b, v7.8b, v21.8b            // out row 8

    st1     {v0.8b}, [x0], x3               // store row 1
    st1     {v2.8b}, [x12], x3              // store row 5
    st1     {v4.8b}, [x0], x3               // store row 2
    st1     {v6.8b}, [x12], x3              // store row 6
    st1     {v1.8b}, [x0], x3               // store row 3
    st1     {v3.8b}, [x12], x3              // store row 7
    st1     {v5.8b}, [x0], x3               // store row 4
    st1     {v7.8b}, [x12], x3              // store row 8

    pop_v_regs
    ret
///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_halfx_halfy_8x8_av8()
////
//// Detail Description : Motion compensation of an 8x8 block with half-pel
////                      displacement in both directions. Every output
////                      pixel is the rounded average of a 2x2 group of
////                      reference pixels:
////                          out[r][c] = (ref[r][c]     + ref[r][c + 1] +
////                                       ref[r + 1][c] + ref[r + 1][c + 1]
////                                       + 2) >> 2
////                      (Per the original header it is called for blocks
////                      that are not coded.)
////
//// Inputs             : x0 - out     : current block pointer
////                      x1 - ref     : reference block pointer
////                      x2 - ref_wid : reference row stride in bytes
////                      x3 - out_wid : output row stride in bytes
////
//// Registers Used     : x14, v0-v18, v20, v22, v24, v26, v28, v30
////                      (x0 and x1 are advanced)
////
//// Stack Usage        : none in this body; push_v_regs / pop_v_regs come
////                      from impeg2_neon_macros.s (the original header
////                      quotes 64 bytes)
////
//// Outputs            : The motion compensated block
////
//// Return Data        : None
////
//// Programming Note   : Each row load fetches 16 bytes although only the
////                      first 9 contribute to the result.
////                      x2 and x3 are used as full 64-bit post-index
////                      offsets. NOTE(review): confirm the caller passes
////                      them extended to 64 bits.
////-----------------------------------------------------------------------------
//*/
.global impeg2_mc_halfx_halfy_8x8_av8
impeg2_mc_halfx_halfy_8x8_av8:
    push_v_regs

    // x1 walks reference rows 1-4, x14 walks rows 5-9.
    add     x14, x1, x2, lsl #2

    // ---- Phase 1: fetch 16 bytes from each of the nine reference rows ----
    ld1     {v0.8b, v1.8b}, [x1], x2        // ref row 1
    ld1     {v2.8b, v3.8b}, [x14], x2       // ref row 5
    ld1     {v4.8b, v5.8b}, [x1], x2        // ref row 2
    ld1     {v6.8b, v7.8b}, [x14], x2       // ref row 6
    ld1     {v8.8b, v9.8b}, [x1], x2        // ref row 3
    ld1     {v10.8b, v11.8b}, [x14], x2     // ref row 7
    ld1     {v12.8b, v13.8b}, [x1], x2      // ref row 4
    ld1     {v14.8b, v15.8b}, [x14], x2     // ref row 8
    ld1     {v16.8b, v17.8b}, [x14], x2     // ref row 9

    // ---- Phase 2: build the right-hand neighbours (pixels 1..8) ----
    ext     v1.8b, v0.8b, v1.8b, #1         // row 1
    ext     v3.8b, v2.8b, v3.8b, #1         // row 5
    ext     v5.8b, v4.8b, v5.8b, #1         // row 2
    ext     v7.8b, v6.8b, v7.8b, #1         // row 6
    ext     v9.8b, v8.8b, v9.8b, #1         // row 3
    ext     v11.8b, v10.8b, v11.8b, #1      // row 7
    ext     v13.8b, v12.8b, v13.8b, #1      // row 4
    ext     v15.8b, v14.8b, v15.8b, #1      // row 8
    ext     v17.8b, v16.8b, v17.8b, #1      // row 9

    // ---- Phase 3: horizontal pair sums, widened to 16 bits ----
    uaddl   v0.8h, v0.8b, v1.8b             // row 1
    uaddl   v2.8h, v2.8b, v3.8b             // row 5
    uaddl   v4.8h, v4.8b, v5.8b             // row 2
    uaddl   v6.8h, v6.8b, v7.8b             // row 6
    uaddl   v8.8h, v8.8b, v9.8b             // row 3
    uaddl   v10.8h, v10.8b, v11.8b          // row 7
    uaddl   v12.8h, v12.8b, v13.8b          // row 4
    uaddl   v14.8h, v14.8b, v15.8b          // row 8
    uaddl   v16.8h, v16.8b, v17.8b          // row 9

    // x0 writes output rows 1-4, x14 is re-pointed at output row 5.
    add     x14, x0, x3, lsl #2

    // ---- Phase 4: vertical sums of adjacent rows ----
    add     v18.8h, v0.8h, v4.8h            // rows 1 + 2
    add     v26.8h, v2.8h, v6.8h            // rows 5 + 6
    add     v20.8h, v4.8h, v8.8h            // rows 2 + 3
    add     v28.8h, v6.8h, v10.8h           // rows 6 + 7
    add     v22.8h, v8.8h, v12.8h           // rows 3 + 4
    add     v30.8h, v10.8h, v14.8h          // rows 7 + 8 (reads v14 before it is overwritten)
    add     v24.8h, v12.8h, v2.8h           // rows 4 + 5
    add     v14.8h, v14.8h, v16.8h          // rows 8 + 9 (reuses v14)

    // ---- Phase 5: rounding divide by four, narrowed back to bytes ----
    rshrn   v18.8b, v18.8h, #2              // out row 1
    rshrn   v26.8b, v26.8h, #2              // out row 5
    rshrn   v20.8b, v20.8h, #2              // out row 2
    rshrn   v28.8b, v28.8h, #2              // out row 6
    rshrn   v22.8b, v22.8h, #2              // out row 3
    rshrn   v30.8b, v30.8h, #2              // out row 7
    rshrn   v24.8b, v24.8h, #2              // out row 4
    rshrn   v14.8b, v14.8h, #2              // out row 8

    // ---- Phase 6: write the eight output rows ----
    st1     {v18.8b}, [x0], x3              // row 1
    st1     {v26.8b}, [x14], x3             // row 5
    st1     {v20.8b}, [x0], x3              // row 2
    st1     {v28.8b}, [x14], x3             // row 6
    st1     {v22.8b}, [x0], x3              // row 3
    st1     {v30.8b}, [x14], x3             // row 7
    st1     {v24.8b}, [x0], x3              // row 4
    st1     {v14.8b}, [x14], x3             // row 8

    pop_v_regs
    ret
///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_fullx_fully_8x8_av8()
////
//// Detail Description : Motion compensation of an 8x8 block with full-pel
////                      displacement in both directions: the eight
////                      reference rows are copied to the output unchanged.
////                      (Per the original header it is called for blocks
////                      that are not coded.)
////
//// Inputs             : x0 - out     : current block pointer
////                      x1 - ref     : reference block pointer
////                      x2 - ref_wid : reference row stride in bytes
////                      x3 - out_wid : output row stride in bytes
////
//// Registers Used     : x12, x14, v0-v3 (x0 and x1 are advanced)
////
//// Stack Usage        : none in this body; push_v_regs / pop_v_regs come
////                      from impeg2_neon_macros.s (the original header
////                      quotes 64 bytes)
////
//// Outputs            : The motion compensated block
////
//// Return Data        : None
////
//// Programming Note   : x2 and x3 are used as full 64-bit post-index
////                      offsets. NOTE(review): confirm the caller passes
////                      them extended to 64 bits.
////-----------------------------------------------------------------------------
//*/
.global impeg2_mc_fullx_fully_8x8_av8
impeg2_mc_fullx_fully_8x8_av8:
    push_v_regs

    // The block is handled as two halves in parallel:
    // x1 / x0 cover rows 1-4, x14 / x12 cover rows 5-8.
    add     x14, x1, x2, lsl #2             // x14 -> ref row 5
    add     x12, x0, x3, lsl #2             // x12 -> out row 5

    // Each pass moves four rows: rows 1, 5, 2, 6 on the first pass and
    // rows 3, 7, 4, 8 on the second.
    .rept   2
    ld1     {v0.8b}, [x1], x2               // next row of the upper half
    ld1     {v1.8b}, [x14], x2              // next row of the lower half
    ld1     {v2.8b}, [x1], x2               // following row, upper half
    ld1     {v3.8b}, [x14], x2              // following row, lower half
    st1     {v0.8b}, [x0], x3
    st1     {v1.8b}, [x12], x3
    st1     {v2.8b}, [x0], x3
    st1     {v3.8b}, [x12], x3
    .endr

    pop_v_regs
    ret
///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_interpolate_av8()
////
//// Detail Description : Averages two source macroblocks, with rounding,
////                      into the destination planes:
////                          dst = (src1 + src2 + 1) >> 1   per byte
////                      16 rows of 16 bytes are processed for y, then
////                      8 rows of 8 bytes each for u and v. Both sources
////                      are read as packed rows (16 bytes per y row,
////                      8 bytes per u/v row); only the destination uses
////                      the stride in x3.
////
//// Inputs             : x0 - pointer to the src1 descriptor; the plane
////                           pointers are read from byte offsets 0 (y),
////                           8 (u) and 16 (v)
////                      x1 - pointer to the src2 descriptor, same offsets
////                      x2 - pointer to the destination descriptor, same
////                           offsets
////                      x3 - destination row stride in bytes for y
////                           (u and v use x3 >> 1)
////
//// Registers Used     : x4, x5, x7, x12, v0-v15 (x3 is halved in place)
////
//// Stack Usage        : none in this body; push_v_regs / pop_v_regs come
////                      from impeg2_neon_macros.s (the original header
////                      quotes 64 bytes)
////
//// Outputs            : The averaged macroblock in the destination planes
////
//// Return Data        : None
////
//// Programming Note   : x3 is used as a full 64-bit post-index offset.
////                      NOTE(review): confirm the caller passes it
////                      extended to 64 bits.
////-----------------------------------------------------------------------------
//*/
.global impeg2_interpolate_av8
impeg2_interpolate_av8:
    push_v_regs

    ldr     x4, [x0, #0]                    // x4 = src1 y plane
    ldr     x5, [x1, #0]                    // x5 = src2 y plane
    ldr     x7, [x2, #0]                    // x7 = dst y plane
    mov     x12, #4                         // 4 passes x 4 rows = 16 luma rows

interp_lumablocks_stride:
    ld1     {v0.16b}, [x4], #16             // src1 row 1 of this pass
    ld1     {v2.16b}, [x4], #16             // src1 row 2
    ld1     {v4.16b}, [x4], #16             // src1 row 3
    ld1     {v6.16b}, [x4], #16             // src1 row 4
    ld1     {v8.16b}, [x5], #16             // src2 row 1
    ld1     {v10.16b}, [x5], #16            // src2 row 2
    ld1     {v12.16b}, [x5], #16            // src2 row 3
    ld1     {v14.16b}, [x5], #16            // src2 row 4
    urhadd  v0.16b, v0.16b, v8.16b          // rounded average, row 1
    urhadd  v2.16b, v2.16b, v10.16b         // row 2
    urhadd  v4.16b, v4.16b, v12.16b         // row 3
    urhadd  v6.16b, v6.16b, v14.16b         // row 4
    st1     {v0.16b}, [x7], x3              // dst row 1
    st1     {v2.16b}, [x7], x3              // dst row 2
    st1     {v4.16b}, [x7], x3              // dst row 3
    st1     {v6.16b}, [x7], x3              // dst row 4
    subs    x12, x12, #1
    bne     interp_lumablocks_stride

    lsr     x3, x3, #1                      // chroma planes use half the luma stride
    ldr     x4, [x0, #8]                    // x4 = src1 u plane
    ldr     x5, [x1, #8]                    // x5 = src2 u plane
    ldr     x7, [x2, #8]                    // x7 = dst u plane
    mov     x12, #2                         // pass 1 handles u, pass 2 handles v

    // Chroma: one pass processes a whole 8x8 plane. Each register holds a
    // single 8-byte row, so the 64-bit (.8b) arrangement is sufficient.
interp_chromablocks_stride:
    ld1     {v0.8b, v1.8b}, [x4], #16       // src1 rows 1 and 2
    ld1     {v2.8b, v3.8b}, [x4], #16       // src1 rows 3 and 4
    ld1     {v4.8b, v5.8b}, [x4], #16       // src1 rows 5 and 6
    ld1     {v6.8b, v7.8b}, [x4], #16       // src1 rows 7 and 8
    ld1     {v8.8b, v9.8b}, [x5], #16       // src2 rows 1 and 2
    ld1     {v10.8b, v11.8b}, [x5], #16     // src2 rows 3 and 4
    ld1     {v12.8b, v13.8b}, [x5], #16     // src2 rows 5 and 6
    ld1     {v14.8b, v15.8b}, [x5], #16     // src2 rows 7 and 8
    urhadd  v0.8b, v0.8b, v8.8b             // rounded average, row 1
    urhadd  v1.8b, v1.8b, v9.8b             // row 2
    urhadd  v2.8b, v2.8b, v10.8b            // row 3
    urhadd  v3.8b, v3.8b, v11.8b            // row 4
    urhadd  v4.8b, v4.8b, v12.8b            // row 5
    urhadd  v5.8b, v5.8b, v13.8b            // row 6
    urhadd  v6.8b, v6.8b, v14.8b            // row 7
    urhadd  v7.8b, v7.8b, v15.8b            // row 8
    st1     {v0.8b}, [x7], x3               // dst row 1
    st1     {v1.8b}, [x7], x3               // dst row 2
    st1     {v2.8b}, [x7], x3               // dst row 3
    st1     {v3.8b}, [x7], x3               // dst row 4
    st1     {v4.8b}, [x7], x3               // dst row 5
    st1     {v5.8b}, [x7], x3               // dst row 6
    st1     {v6.8b}, [x7], x3               // dst row 7
    st1     {v7.8b}, [x7], x3               // dst row 8

    // Switch to the v plane for the second pass. The reload is also
    // executed after the final pass, where its result is unused.
    ldr     x4, [x0, #16]                   // x4 = src1 v plane
    ldr     x5, [x1, #16]                   // x5 = src2 v plane
    ldr     x7, [x2, #16]                   // x7 = dst v plane
    subs    x12, x12, #1
    bne     interp_chromablocks_stride

    pop_v_regs
    ret