// blob: 546c807e5d0c5b3fbceec8eda44907be08877cd9 [file] [log] [blame]
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//* ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
//*
//* @brief
//* Contains function definitions for inter prediction interpolation.
//*
//* @author
//* Mohit
//*
//* @par List of Functions:
//*
//* - ih264_inter_pred_luma_horz_hpel_vert_qpel_av8()
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
///* All the functions here are replicated from ih264_inter_pred_filters.c
//
///**
///**
///**
//*******************************************************************************
//*
//* @brief
//* This function implements a two stage cascaded six tap filter. It
//* applies the six tap filter in the horizontal direction on the
//* predictor values, followed by applying the same filter in the
//* vertical direction on the output of the first stage. It then averages
//* the output of the 1st stage and the output of the 2nd stage to obtain
//* the quarter pel values. The six tap filtering operation is described
//* in sec 8.4.2.2.1 titled "Luma sample interpolation process".
//*
//* @par Description:
//* This function is called to obtain pixels lying at the following
//* location (1/2,1/4) or (1/2,3/4). The function interpolates
//* the predictors first in the horizontal direction and then in the
//* vertical direction to output the (1/2,1/2) value. It then averages
//* the output of the 1st stage (the horizontal half-pel value) and the
//* (1/2,1/2) value to obtain (1/2,1/4)
//* or (1/2,3/4) depending on the offset.
//*
//* @param[in] pu1_src
//* UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//* UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//* integer source stride
//*
//* @param[in] dst_strd
//* integer destination stride
//*
//* @param[in] ht
//* integer height of the array
//*
//* @param[in] wd
//* integer width of the array
//*
//* @param[in] pu1_tmp: temporary buffer
//*
//* @param[in] dydx: x and y reference offset for qpel calculations
//*
//* @returns
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/;
//void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
// UWORD8 *pu1_dst,
// WORD32 src_strd,
// WORD32 dst_strd,
// WORD32 ht,
// WORD32 wd,
// UWORD8* pu1_tmp,
// UWORD32 dydx)
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
// x2 => src_strd
// x3 => dst_strd
// x4 => ht
// x5 => wd
// x6 => *pu1_tmp (on entry; copied to x9 before x6 is reused)
// x7 => dydx
// x9 => *pu1_tmp (working copy)
.text
.p2align 2
.include "ih264_neon_macros.s"
.global ih264_inter_pred_luma_horz_hpel_vert_qpel_av8
ih264_inter_pred_luma_horz_hpel_vert_qpel_av8:
// Entry and common setup.
// In:  x0 = pu1_src, x1 = pu1_dst, x2 = src_strd, x3 = dst_strd,
//      x4 = ht, x5 = wd, x6 = pu1_tmp, x7 = dydx
// Out of this block (all widths):
//      x0 = pu1_src - 2 * src_strd - 2
//      x6 = 48, the byte stride between rows of the temp buffer
//      x9 = pu1_tmp (write cursor for horizontally filtered rows)
//      x7 = pu1_tmp + 48 * (2 + (dydx >> 3)) (read cursor for the averaging rows)
// wd == 4 and wd == 8 branch away; any other width falls through to the
// 16-wide path, which is processed as two 8-column halves.
push_v_regs // macro from ih264_neon_macros.s; presumably saves the callee-saved SIMD registers - confirm
stp x19, x20, [sp, #-16]! // matching ldp is at end_func
mov x9, x6 // keep pu1_tmp in x9 so x6 can hold the stride
mov x6, #48 // temp buffer row stride in bytes
lsr x7, x7, #3 // dydx >> 3
add x7, x7, #2 // skip the two rows above the block
madd x7, x7, x6, x9 // x7 = pu1_tmp + 48 * (2 + (dydx >> 3))
sub x0, x0, x2, lsl #1 // pu1_src - 2 * src_strd
sub x0, x0, #2 // ... and two pixels to the left
subs x12, x5, #4 // wd == 4 ?
b.eq loop_4_start
subs x12, x5, #8 // wd == 8 ?
b.eq loop_8_start
// 16-wide path: extra pointers for the right-hand 8 columns.
mov x12, x4 // row counter for the high half (x4 counts the low half)
add x8, x0, #8 // source pointer, high half
add x10, x9, #8 // temp write cursor, high half (byte offset 8)
add x11, x7, #8 // temp read cursor, high half (byte offset 8)
add x14, x1, #8 // destination pointer, high half
movi v22.8h, #0x14 // filter coefficient 20 in every 16-bit lane
movi v24.8h, #5 // filter coefficient 5 in every 16-bit lane
loop_16_lowhalf_start:
// Priming stage for the left 8 columns of the 16-wide path.
// Runs the horizontal six-tap filter on the first five source rows and
// leaves the 16-bit results in v6, v8, v10, v12 and v14 (oldest row first).
// For the six neighbouring pixels a..f starting at the load address:
//     acc = (a + f) + 20 * (c + d) - 5 * (b + e)
// v22 holds 20 and v24 holds 5 in every 16-bit lane.
// The first four rows are also written to the temp buffer through x9
// (post-incremented by x6); v14 is written by the loop that follows.
ld1 {v0.2s, v1.2s}, [x0], x2 // row -2 load for horizontal filter
ext v5.8b, v0.8b , v1.8b , #5 // tap f
uaddl v6.8h, v0.8b, v5.8b // v6 = a + f, widened to 16 bit
ext v2.8b, v0.8b , v1.8b , #2 // tap c
ext v3.8b, v0.8b , v1.8b , #3 // tap d
uaddl v8.8h, v2.8b, v3.8b // c + d
ext v4.8b, v0.8b , v1.8b , #4 // tap e
mla v6.8h, v8.8h , v22.8h // v6 += 20 * (c + d)
ext v1.8b, v0.8b , v1.8b , #1 // tap b
uaddl v8.8h, v1.8b, v4.8b // b + e
ld1 {v0.2s, v1.2s}, [x0], x2 // row -1 load for horizontal filter
mls v6.8h, v8.8h , v24.8h // v6 -= 5 * (b + e): row -2 complete
ext v5.8b, v0.8b , v1.8b , #5
uaddl v8.8h, v0.8b, v5.8b // v8 = a + f for row -1
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v10.8h, v2.8b, v3.8b
st1 {v6.4s}, [x9], x6 // store temp buffer 0
ext v4.8b, v0.8b , v1.8b , #4
mla v8.8h, v10.8h , v22.8h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v10.8h, v1.8b, v4.8b
ld1 {v0.2s, v1.2s}, [x0], x2 // row 0 load for horizontal filter
mls v8.8h, v10.8h , v24.8h // row -1 complete in v8
ext v5.8b, v0.8b , v1.8b , #5
uaddl v10.8h, v0.8b, v5.8b // v10 = a + f for row 0
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v12.8h, v2.8b, v3.8b
st1 {v8.4s}, [x9], x6 // store temp buffer 1
ext v4.8b, v0.8b , v1.8b , #4
mla v10.8h, v12.8h , v22.8h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v12.8h, v1.8b, v4.8b
ld1 {v0.2s, v1.2s}, [x0], x2 // row 1 load for horizontal filter
mls v10.8h, v12.8h , v24.8h // row 0 complete in v10
ext v5.8b, v0.8b , v1.8b , #5
uaddl v12.8h, v0.8b, v5.8b // v12 = a + f for row 1
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v14.8h, v2.8b, v3.8b
st1 {v10.4s}, [x9], x6 // store temp buffer 2
ext v4.8b, v0.8b , v1.8b , #4
mla v12.8h, v14.8h , v22.8h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v14.8h, v1.8b, v4.8b
ld1 {v0.2s, v1.2s}, [x0], x2 // row 2 load for horizontal filter
mls v12.8h, v14.8h , v24.8h // row 1 complete in v12
ext v5.8b, v0.8b , v1.8b , #5
uaddl v14.8h, v0.8b, v5.8b // v14 = a + f for row 2
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v16.8h, v2.8b, v3.8b
st1 {v12.4s}, [x9], x6 // store temp buffer 3
ext v4.8b, v0.8b , v1.8b , #4
mla v14.8h, v16.8h , v22.8h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v16.8h, v1.8b, v4.8b
mls v14.8h, v16.8h , v24.8h // row 2 complete in v14 (stored at the top of the loop)
loop_16_lowhalf:
// Steady-state loop for the left 8 columns: produces 4 output rows per pass.
//
// On entry v6, v8, v10, v12, v14 hold five consecutive horizontally
// filtered rows R0..R4 (16 bit). Each pass filters four more source rows
// (R5 -> v16, R6 -> v20, R7 -> v8, R8 -> v28), stores them to the temp
// buffer through x9, and for each output row k = 0..3:
//   1. applies the six-tap filter vertically over R(k)..R(k+5) in 32 bit:
//          (Rk + Rk+5) + 20 * (Rk+2 + Rk+3) - 5 * (Rk+1 + Rk+4)
//      then rounds/shifts by 10 and saturates to 8 bit,
//   2. reloads one horizontally filtered row from the temp buffer through
//      x7, rounds/shifts it by 5 and saturates to 8 bit,
//   3. stores the rounding average (urhadd) of the two to [x1].
// v18 / v6 are the 32-bit accumulators for lanes 0-3 / 4-7; v28, v30, v26
// and v10 are reused as scratch once their row value is no longer needed.
// At the bottom the window is rotated so that v6..v14 hold R4..R8.
//
// Changes relative to the previous revision of this loop:
//  * R8 is now stored as a full {v28.4s}; the old {v28.2s, v29.2s} form
//    wrote the low half of v28 followed by the low half of v29, and v29
//    is never written by this function.
//  * moves between the odd registers v7/v9/v11/v13/v15 were dropped;
//    nothing ever consumed their values.
ld1 {v0.2s, v1.2s}, [x0], x2 // row 3 load for horizontal filter
ext v5.8b, v0.8b, v1.8b, #5
ext v2.8b, v0.8b, v1.8b, #2
ext v3.8b, v0.8b, v1.8b, #3
uaddl v16.8h, v0.8b, v5.8b // R5: a + f
st1 {v14.4s}, [x9], x6 // store temp buffer 4 (R4)
uaddl v18.8h, v2.8b, v3.8b
ext v4.8b, v0.8b, v1.8b, #4
mla v16.8h, v18.8h, v22.8h // R5 += 20 * (c + d)
ext v1.8b, v0.8b, v1.8b, #1
add v28.8h, v8.8h, v14.8h // output 0: R1 + R4
uaddl v18.8h, v1.8b, v4.8b
add v30.8h, v10.8h, v12.8h // output 0: R2 + R3
mls v16.8h, v18.8h, v24.8h // R5 -= 5 * (b + e)
ld1 {v0.2s, v1.2s}, [x0], x2 // row 4 load for horizontal filter
ext v5.8b, v0.8b, v1.8b, #5
ext v2.8b, v0.8b, v1.8b, #2
ext v3.8b, v0.8b, v1.8b, #3
uaddl v20.8h, v0.8b, v5.8b // R6: a + f
st1 {v16.4s}, [x9], x6 // store temp buffer 5 (R5)
saddl v18.4s, v6.4h, v16.4h // output 0, lanes 0-3: R0 + R5
ld1 {v26.4s}, [x7], x6 // temp-buffer row to average with output 0
saddl2 v6.4s, v6.8h, v16.8h // output 0, lanes 4-7 (v6 becomes an accumulator)
sqrshrun v26.8b, v26.8h, #5 // horizontal half-pel value, 8 bit
smlal v18.4s, v30.4h, v22.4h
smlsl v18.4s, v28.4h, v24.4h
smlal2 v6.4s, v30.8h, v22.8h
smlsl2 v6.4s, v28.8h, v24.8h
uaddl v2.8h, v2.8b, v3.8b
ext v4.8b, v0.8b, v1.8b, #4
mla v20.8h, v2.8h, v22.8h
sqrshrun v18.4h, v18.4s, #10
ext v1.8b, v0.8b, v1.8b, #1
sqrshrun v19.4h, v6.4s, #10
add v28.8h, v10.8h, v16.8h // output 1: R2 + R5
uaddl v2.8h, v1.8b, v4.8b
add v30.8h, v12.8h, v14.8h // output 1: R3 + R4
mls v20.8h, v2.8h, v24.8h // R6 complete
uqxtn v18.8b, v18.8h
uqxtn v19.8b, v19.8h
mov v18.s[1], v19.s[0] // join lanes 0-3 and 4-7 of output 0
ld1 {v0.2s, v1.2s}, [x0], x2 // row 5 load for horizontal filter
urhadd v26.8b, v18.8b, v26.8b // output 0 = rounding average
ext v5.8b, v0.8b, v1.8b, #5
ext v2.8b, v0.8b, v1.8b, #2
st1 {v20.4s}, [x9], x6 // store temp buffer 6 (R6)
saddl v18.4s, v8.4h, v20.4h // output 1, lanes 0-3: R1 + R6
saddl2 v6.4s, v8.8h, v20.8h // output 1, lanes 4-7
ld1 {v8.4s}, [x7], x6 // temp-buffer row to average with output 1
st1 {v26.2s}, [x1], x3 // store row 0
smlal v18.4s, v30.4h, v22.4h
smlsl v18.4s, v28.4h, v24.4h
smlal2 v6.4s, v30.8h, v22.8h
smlsl2 v6.4s, v28.8h, v24.8h
sqrshrun v28.8b, v8.8h, #5 // horizontal half-pel value, 8 bit
ext v3.8b, v0.8b, v1.8b, #3
uaddl v8.8h, v0.8b, v5.8b // R7: a + f (v8 is free again)
uaddl v2.8h, v2.8b, v3.8b
sqrshrun v18.4h, v18.4s, #10
ext v4.8b, v0.8b, v1.8b, #4
sqrshrun v19.4h, v6.4s, #10
mla v8.8h, v2.8h, v22.8h
ext v1.8b, v0.8b, v1.8b, #1
add v26.8h, v12.8h, v20.8h // output 2: R3 + R6
uaddl v2.8h, v1.8b, v4.8b
uqxtn v18.8b, v18.8h
uqxtn v19.8b, v19.8h
mov v18.s[1], v19.s[0]
add v30.8h, v14.8h, v16.8h // output 2: R4 + R5
mls v8.8h, v2.8h, v24.8h // R7 complete
ld1 {v0.2s, v1.2s}, [x0], x2 // row 6 load for horizontal filter
urhadd v28.8b, v28.8b, v18.8b // output 1 = rounding average
ext v5.8b, v0.8b, v1.8b, #5
ext v2.8b, v0.8b, v1.8b, #2
ext v3.8b, v0.8b, v1.8b, #3
st1 {v28.2s}, [x1], x3 // store row 1
uaddl v28.8h, v0.8b, v5.8b // R8: a + f
st1 {v8.4s}, [x9], x6 // store temp buffer 7 (R7)
saddl v18.4s, v10.4h, v8.4h // output 2, lanes 0-3: R2 + R7
saddl2 v6.4s, v10.8h, v8.8h // output 2, lanes 4-7
ld1 {v10.4s}, [x7], x6 // temp-buffer row to average with output 2
smlal v18.4s, v30.4h, v22.4h
smlsl v18.4s, v26.4h, v24.4h
smlal2 v6.4s, v30.8h, v22.8h
smlsl2 v6.4s, v26.8h, v24.8h
sqrshrun v26.8b, v10.8h, #5 // horizontal half-pel value, 8 bit
uaddl v2.8h, v2.8b, v3.8b
ext v4.8b, v0.8b, v1.8b, #4
mla v28.8h, v2.8h, v22.8h
sqrshrun v18.4h, v18.4s, #10
ext v1.8b, v0.8b, v1.8b, #1
sqrshrun v19.4h, v6.4s, #10
add v10.8h, v14.8h, v8.8h // output 3: R4 + R7
uaddl v2.8h, v1.8b, v4.8b
add v30.8h, v16.8h, v20.8h // output 3: R5 + R6
mls v28.8h, v2.8h, v24.8h // R8 complete
uqxtn v27.8b, v18.8h
uqxtn v19.8b, v19.8h
mov v27.s[1], v19.s[0]
saddl v18.4s, v12.4h, v28.4h // output 3, lanes 0-3: R3 + R8
saddl2 v6.4s, v12.8h, v28.8h // output 3, lanes 4-7
urhadd v26.8b, v26.8b, v27.8b // output 2 = rounding average
smlal v18.4s, v30.4h, v22.4h
smlsl v18.4s, v10.4h, v24.4h
smlal2 v6.4s, v30.8h, v22.8h
smlsl2 v6.4s, v10.8h, v24.8h
st1 {v26.2s}, [x1], x3 // store row 2
st1 {v28.4s}, [x9] // store R8 without advancing x9; the next pass stores the same row again and advances
sqrshrun v18.4h, v18.4s, #10
mov v10.16b, v20.16b // window rotate: R6 -> v10
ld1 {v30.4s}, [x7], x6 // temp-buffer row to average with output 3
sqrshrun v19.4h, v6.4s, #10
subs x4, x4, #4 // four rows done; flags are consumed by the bgt below
sqrshrun v30.8b, v30.8h, #5 // horizontal half-pel value, 8 bit
uqxtn v18.8b, v18.8h
uqxtn v19.8b, v19.8h
mov v18.s[1], v19.s[0]
mov v12.16b, v8.16b // window rotate: R7 -> v12
mov v6.16b, v14.16b // window rotate: R4 -> v6
urhadd v30.8b, v18.8b, v30.8b // output 3 = rounding average
mov v8.16b, v16.16b // window rotate: R5 -> v8
mov v14.16b, v28.16b // window rotate: R8 -> v14
st1 {v30.2s}, [x1], x3 // store row 3
bgt loop_16_lowhalf // more rows left in the low half
loop_16_highhalf_start:
// Priming stage for the right 8 columns of the 16-wide path.
// Same computation as the low-half priming stage, but reading the source
// through x8 and writing the temp buffer through x10 (both were set up at
// function entry as the low-half pointers plus 8 bytes).
// Result: five horizontally filtered rows (16 bit) in v6, v8, v10, v12, v14,
// the first four of them also stored to the temp buffer.
//     acc = (a + f) + 20 * (c + d) - 5 * (b + e),  v22 = 20, v24 = 5
// NOTE(review): each stored row is 16 bytes while x10 is only 8 bytes past
// the low-half rows, so these stores overlap the low-half temp data; the
// low-half loop has already finished when execution reaches this label.
ld1 {v0.2s, v1.2s}, [x8], x2 // first source row (two rows above the block)
ext v5.8b, v0.8b , v1.8b , #5 // tap f
uaddl v6.8h, v0.8b, v5.8b // v6 = a + f
ext v2.8b, v0.8b , v1.8b , #2 // tap c
ext v3.8b, v0.8b , v1.8b , #3 // tap d
uaddl v8.8h, v2.8b, v3.8b // c + d
ext v4.8b, v0.8b , v1.8b , #4 // tap e
mla v6.8h, v8.8h , v22.8h // v6 += 20 * (c + d)
ext v1.8b, v0.8b , v1.8b , #1 // tap b
uaddl v8.8h, v1.8b, v4.8b // b + e
ld1 {v0.2s, v1.2s}, [x8], x2 // second source row
mls v6.8h, v8.8h , v24.8h // v6 -= 5 * (b + e): first row complete
ext v5.8b, v0.8b , v1.8b , #5
uaddl v8.8h, v0.8b, v5.8b
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v10.8h, v2.8b, v3.8b
st1 {v6.4s}, [x10], x6 // store first filtered row to the temp buffer
ext v4.8b, v0.8b , v1.8b , #4
mla v8.8h, v10.8h , v22.8h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v10.8h, v1.8b, v4.8b
ld1 {v0.2s, v1.2s}, [x8], x2 // third source row
mls v8.8h, v10.8h , v24.8h // second row complete in v8
ext v5.8b, v0.8b , v1.8b , #5
uaddl v10.8h, v0.8b, v5.8b
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v12.8h, v2.8b, v3.8b
st1 {v8.4s}, [x10], x6 // store second filtered row
ext v4.8b, v0.8b , v1.8b , #4
mla v10.8h, v12.8h , v22.8h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v12.8h, v1.8b, v4.8b
ld1 {v0.2s, v1.2s}, [x8], x2 // fourth source row
mls v10.8h, v12.8h , v24.8h // third row complete in v10
ext v5.8b, v0.8b , v1.8b , #5
uaddl v12.8h, v0.8b, v5.8b
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v14.8h, v2.8b, v3.8b
st1 {v10.4s}, [x10], x6 // store third filtered row
ext v4.8b, v0.8b , v1.8b , #4
mla v12.8h, v14.8h , v22.8h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v14.8h, v1.8b, v4.8b
ld1 {v0.2s, v1.2s}, [x8], x2 // fifth source row
mls v12.8h, v14.8h , v24.8h // fourth row complete in v12
ext v5.8b, v0.8b , v1.8b , #5
uaddl v14.8h, v0.8b, v5.8b
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v16.8h, v2.8b, v3.8b
st1 {v12.4s}, [x10], x6 // store fourth filtered row
ext v4.8b, v0.8b , v1.8b , #4
mla v14.8h, v16.8h , v22.8h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v16.8h, v1.8b, v4.8b
mls v14.8h, v16.8h , v24.8h // fifth row complete in v14 (stored at the top of the loop)
loop_16_highhalf:
// Steady-state loop for the right 8 columns: produces 4 output rows per pass.
//
// Pointers: x8 = source, x10 = temp-buffer write cursor, x11 = temp-buffer
// read cursor, x14 = destination, x12 = remaining rows.
// On entry v6, v8, v10, v12, v14 hold five consecutive horizontally
// filtered rows R0..R4 (16 bit). Each pass filters four more source rows
// (R5 -> v16, R6 -> v20, R7 -> v8, R8 -> v28), stores them to the temp
// buffer, and for each output row k = 0..3:
//   1. applies the six-tap filter vertically over R(k)..R(k+5) in 32 bit:
//          (Rk + Rk+5) + 20 * (Rk+2 + Rk+3) - 5 * (Rk+1 + Rk+4)
//      then rounds/shifts by 10 and saturates to 8 bit,
//   2. reloads one horizontally filtered row from the temp buffer,
//      rounds/shifts it by 5 and saturates to 8 bit,
//   3. stores the rounding average (urhadd) of the two to [x14].
// v18 / v6 are the 32-bit accumulators for lanes 0-3 / 4-7.
// At the bottom the window is rotated so that v6..v14 hold R4..R8.
//
// Change relative to the previous revision of this loop: moves between the
// odd registers v7/v9/v11/v13/v15 were dropped; nothing consumed them.
ld1 {v0.2s, v1.2s}, [x8], x2 // next source row -> R5
ext v5.8b, v0.8b, v1.8b, #5
ext v2.8b, v0.8b, v1.8b, #2
ext v3.8b, v0.8b, v1.8b, #3
uaddl v16.8h, v0.8b, v5.8b // R5: a + f
st1 {v14.4s}, [x10], x6 // store R4 to the temp buffer
uaddl v18.8h, v2.8b, v3.8b
ext v4.8b, v0.8b, v1.8b, #4
mla v16.8h, v18.8h, v22.8h // R5 += 20 * (c + d)
ext v1.8b, v0.8b, v1.8b, #1
add v28.8h, v8.8h, v14.8h // output 0: R1 + R4
uaddl v18.8h, v1.8b, v4.8b
add v30.8h, v10.8h, v12.8h // output 0: R2 + R3
mls v16.8h, v18.8h, v24.8h // R5 -= 5 * (b + e)
ld1 {v0.2s, v1.2s}, [x8], x2 // next source row -> R6
ext v5.8b, v0.8b, v1.8b, #5
ext v2.8b, v0.8b, v1.8b, #2
ext v3.8b, v0.8b, v1.8b, #3
uaddl v20.8h, v0.8b, v5.8b // R6: a + f
st1 {v16.4s}, [x10], x6 // store R5
saddl v18.4s, v6.4h, v16.4h // output 0, lanes 0-3: R0 + R5
ld1 {v26.4s}, [x11], x6 // temp-buffer row to average with output 0
saddl2 v6.4s, v6.8h, v16.8h // output 0, lanes 4-7 (v6 becomes an accumulator)
sqrshrun v26.8b, v26.8h, #5 // horizontal half-pel value, 8 bit
smlal v18.4s, v30.4h, v22.4h
smlsl v18.4s, v28.4h, v24.4h
smlal2 v6.4s, v30.8h, v22.8h
smlsl2 v6.4s, v28.8h, v24.8h
uaddl v2.8h, v2.8b, v3.8b
ext v4.8b, v0.8b, v1.8b, #4
mla v20.8h, v2.8h, v22.8h
sqrshrun v18.4h, v18.4s, #10
ext v1.8b, v0.8b, v1.8b, #1
sqrshrun v19.4h, v6.4s, #10
add v28.8h, v10.8h, v16.8h // output 1: R2 + R5
uaddl v2.8h, v1.8b, v4.8b
add v30.8h, v12.8h, v14.8h // output 1: R3 + R4
mls v20.8h, v2.8h, v24.8h // R6 complete
uqxtn v18.8b, v18.8h
uqxtn v19.8b, v19.8h
mov v18.s[1], v19.s[0] // join lanes 0-3 and 4-7 of output 0
ld1 {v0.2s, v1.2s}, [x8], x2 // next source row -> R7
urhadd v26.8b, v18.8b, v26.8b // output 0 = rounding average
ext v5.8b, v0.8b, v1.8b, #5
ext v2.8b, v0.8b, v1.8b, #2
st1 {v20.4s}, [x10], x6 // store R6
saddl v18.4s, v8.4h, v20.4h // output 1, lanes 0-3: R1 + R6
saddl2 v6.4s, v8.8h, v20.8h // output 1, lanes 4-7
ld1 {v8.4s}, [x11], x6 // temp-buffer row to average with output 1
st1 {v26.2s}, [x14], x3 // store row 0
smlal v18.4s, v30.4h, v22.4h
smlsl v18.4s, v28.4h, v24.4h
smlal2 v6.4s, v30.8h, v22.8h
smlsl2 v6.4s, v28.8h, v24.8h
sqrshrun v28.8b, v8.8h, #5 // horizontal half-pel value, 8 bit
ext v3.8b, v0.8b, v1.8b, #3
uaddl v8.8h, v0.8b, v5.8b // R7: a + f (v8 is free again)
uaddl v2.8h, v2.8b, v3.8b
sqrshrun v18.4h, v18.4s, #10
ext v4.8b, v0.8b, v1.8b, #4
sqrshrun v19.4h, v6.4s, #10
mla v8.8h, v2.8h, v22.8h
ext v1.8b, v0.8b, v1.8b, #1
add v26.8h, v12.8h, v20.8h // output 2: R3 + R6
uaddl v2.8h, v1.8b, v4.8b
uqxtn v18.8b, v18.8h
uqxtn v19.8b, v19.8h
mov v18.s[1], v19.s[0]
add v30.8h, v14.8h, v16.8h // output 2: R4 + R5
mls v8.8h, v2.8h, v24.8h // R7 complete
ld1 {v0.2s, v1.2s}, [x8], x2 // next source row -> R8
urhadd v28.8b, v28.8b, v18.8b // output 1 = rounding average
ext v5.8b, v0.8b, v1.8b, #5
ext v2.8b, v0.8b, v1.8b, #2
ext v3.8b, v0.8b, v1.8b, #3
st1 {v28.2s}, [x14], x3 // store row 1
uaddl v28.8h, v0.8b, v5.8b // R8: a + f
st1 {v8.4s}, [x10], x6 // store R7
saddl v18.4s, v10.4h, v8.4h // output 2, lanes 0-3: R2 + R7
saddl2 v6.4s, v10.8h, v8.8h // output 2, lanes 4-7
ld1 {v10.4s}, [x11], x6 // temp-buffer row to average with output 2
smlal v18.4s, v30.4h, v22.4h
smlsl v18.4s, v26.4h, v24.4h
smlal2 v6.4s, v30.8h, v22.8h
smlsl2 v6.4s, v26.8h, v24.8h
sqrshrun v26.8b, v10.8h, #5 // horizontal half-pel value, 8 bit
uaddl v2.8h, v2.8b, v3.8b
ext v4.8b, v0.8b, v1.8b, #4
mla v28.8h, v2.8h, v22.8h
sqrshrun v18.4h, v18.4s, #10
ext v1.8b, v0.8b, v1.8b, #1
sqrshrun v19.4h, v6.4s, #10
add v10.8h, v14.8h, v8.8h // output 3: R4 + R7
uaddl v2.8h, v1.8b, v4.8b
add v30.8h, v16.8h, v20.8h // output 3: R5 + R6
mls v28.8h, v2.8h, v24.8h // R8 complete
uqxtn v27.8b, v18.8h
uqxtn v19.8b, v19.8h
mov v27.s[1], v19.s[0]
saddl v18.4s, v12.4h, v28.4h // output 3, lanes 0-3: R3 + R8
saddl2 v6.4s, v12.8h, v28.8h // output 3, lanes 4-7
urhadd v26.8b, v26.8b, v27.8b // output 2 = rounding average
smlal v18.4s, v30.4h, v22.4h
smlsl v18.4s, v10.4h, v24.4h
smlal2 v6.4s, v30.8h, v22.8h
smlsl2 v6.4s, v10.8h, v24.8h
st1 {v26.2s}, [x14], x3 // store row 2
st1 {v28.4s}, [x10] // store R8 without advancing x10; the next pass stores the same row again and advances
sqrshrun v18.4h, v18.4s, #10
mov v10.16b, v20.16b // window rotate: R6 -> v10
ld1 {v30.4s}, [x11], x6 // temp-buffer row to average with output 3
sqrshrun v19.4h, v6.4s, #10
subs x12, x12, #4 // four rows done; flags are consumed by the bgt below
sqrshrun v30.8b, v30.8h, #5 // horizontal half-pel value, 8 bit
uqxtn v18.8b, v18.8h
uqxtn v19.8b, v19.8h
mov v18.s[1], v19.s[0]
mov v12.16b, v8.16b // window rotate: R7 -> v12
mov v6.16b, v14.16b // window rotate: R4 -> v6
urhadd v30.8b, v18.8b, v30.8b // output 3 = rounding average
mov v8.16b, v16.16b // window rotate: R5 -> v8
mov v14.16b, v28.16b // window rotate: R8 -> v14
st1 {v30.2s}, [x14], x3 // store row 3
bgt loop_16_highhalf // more rows left in the high half
b end_func
loop_8_start:
// Priming stage for the 8-wide path (reached when wd == 8).
// Loads the filter constants, then runs the horizontal six-tap filter on
// the first five source rows, leaving the 16-bit results in
// v6, v8, v10, v12 and v14 (oldest row first):
//     acc = (a + f) + 20 * (c + d) - 5 * (b + e)
// The first four rows are also written to the temp buffer through x9
// (post-incremented by x6); v14 is written by loop_8.
movi v22.8h, #0x14 // filter coefficient 20 in every 16-bit lane of v22
movi v24.8h, #5 // filter coefficient 5 in every 16-bit lane of v24
ld1 {v0.2s, v1.2s}, [x0], x2 // row -2 load for horizontal filter
ext v5.8b, v0.8b , v1.8b , #5 // tap f
uaddl v6.8h, v0.8b, v5.8b // v6 = a + f, widened to 16 bit
ext v2.8b, v0.8b , v1.8b , #2 // tap c
ext v3.8b, v0.8b , v1.8b , #3 // tap d
uaddl v8.8h, v2.8b, v3.8b // c + d
ext v4.8b, v0.8b , v1.8b , #4 // tap e
mla v6.8h, v8.8h , v22.8h // v6 += 20 * (c + d)
ext v1.8b, v0.8b , v1.8b , #1 // tap b
uaddl v8.8h, v1.8b, v4.8b // b + e
ld1 {v0.2s, v1.2s}, [x0], x2 // row -1 load for horizontal filter
mls v6.8h, v8.8h , v24.8h // v6 -= 5 * (b + e): row -2 complete
ext v5.8b, v0.8b , v1.8b , #5
uaddl v8.8h, v0.8b, v5.8b
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v10.8h, v2.8b, v3.8b
st1 {v6.4s}, [x9], x6 // store temp buffer 0
ext v4.8b, v0.8b , v1.8b , #4
mla v8.8h, v10.8h , v22.8h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v10.8h, v1.8b, v4.8b
ld1 {v0.2s, v1.2s}, [x0], x2 // row 0 load for horizontal filter
mls v8.8h, v10.8h , v24.8h // row -1 complete in v8
ext v5.8b, v0.8b , v1.8b , #5
uaddl v10.8h, v0.8b, v5.8b
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v12.8h, v2.8b, v3.8b
st1 {v8.4s}, [x9], x6 // store temp buffer 1
ext v4.8b, v0.8b , v1.8b , #4
mla v10.8h, v12.8h , v22.8h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v12.8h, v1.8b, v4.8b
ld1 {v0.2s, v1.2s}, [x0], x2 // row 1 load for horizontal filter
mls v10.8h, v12.8h , v24.8h // row 0 complete in v10
ext v5.8b, v0.8b , v1.8b , #5
uaddl v12.8h, v0.8b, v5.8b
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v14.8h, v2.8b, v3.8b
st1 {v10.4s}, [x9], x6 // store temp buffer 2
ext v4.8b, v0.8b , v1.8b , #4
mla v12.8h, v14.8h , v22.8h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v14.8h, v1.8b, v4.8b
ld1 {v0.2s, v1.2s}, [x0], x2 // row 2 load for horizontal filter
mls v12.8h, v14.8h , v24.8h // row 1 complete in v12
ext v5.8b, v0.8b , v1.8b , #5
uaddl v14.8h, v0.8b, v5.8b
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v16.8h, v2.8b, v3.8b
st1 {v12.4s}, [x9], x6 // store temp buffer 3
ext v4.8b, v0.8b , v1.8b , #4
mla v14.8h, v16.8h , v22.8h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v16.8h, v1.8b, v4.8b
mls v14.8h, v16.8h , v24.8h // row 2 complete in v14 (stored at the top of loop_8)
loop_8:
// Steady-state loop for the 8-wide path: produces 4 output rows per pass.
//
// On entry v6, v8, v10, v12, v14 hold five consecutive horizontally
// filtered rows R0..R4 (16 bit). Each pass filters four more source rows
// (R5 -> v16, R6 -> v20, R7 -> v8, R8 -> v28), stores them to the temp
// buffer through x9, and for each output row k = 0..3:
//   1. applies the six-tap filter vertically over R(k)..R(k+5) in 32 bit:
//          (Rk + Rk+5) + 20 * (Rk+2 + Rk+3) - 5 * (Rk+1 + Rk+4)
//      then rounds/shifts by 10 and saturates to 8 bit,
//   2. reloads one horizontally filtered row from the temp buffer through
//      x7, rounds/shifts it by 5 and saturates to 8 bit,
//   3. stores the rounding average (urhadd) of the two to [x1].
// v18 / v6 are the 32-bit accumulators for lanes 0-3 / 4-7; v28, v30, v26
// and v10 are reused as scratch once their row value is no longer needed.
// At the bottom the window is rotated so that v6..v14 hold R4..R8.
//
// Changes relative to the previous revision of this loop:
//  * R8 is now stored as a full {v28.4s}; the old {v28.2s, v29.2s} form
//    wrote the low half of v28 followed by the low half of v29, and v29
//    is never written by this function.
//  * moves between the odd registers v7/v9/v11/v13/v15 were dropped;
//    nothing ever consumed their values.
ld1 {v0.2s, v1.2s}, [x0], x2 // row 3 load for horizontal filter
ext v5.8b, v0.8b, v1.8b, #5
ext v2.8b, v0.8b, v1.8b, #2
ext v3.8b, v0.8b, v1.8b, #3
uaddl v16.8h, v0.8b, v5.8b // R5: a + f
st1 {v14.4s}, [x9], x6 // store temp buffer 4 (R4)
uaddl v18.8h, v2.8b, v3.8b
ext v4.8b, v0.8b, v1.8b, #4
mla v16.8h, v18.8h, v22.8h // R5 += 20 * (c + d)
ext v1.8b, v0.8b, v1.8b, #1
add v28.8h, v8.8h, v14.8h // output 0: R1 + R4
uaddl v18.8h, v1.8b, v4.8b
add v30.8h, v10.8h, v12.8h // output 0: R2 + R3
mls v16.8h, v18.8h, v24.8h // R5 -= 5 * (b + e)
ld1 {v0.2s, v1.2s}, [x0], x2 // row 4 load for horizontal filter
ext v5.8b, v0.8b, v1.8b, #5
ext v2.8b, v0.8b, v1.8b, #2
ext v3.8b, v0.8b, v1.8b, #3
uaddl v20.8h, v0.8b, v5.8b // R6: a + f
st1 {v16.4s}, [x9], x6 // store temp buffer 5 (R5)
saddl v18.4s, v6.4h, v16.4h // output 0, lanes 0-3: R0 + R5
ld1 {v26.4s}, [x7], x6 // temp-buffer row to average with output 0
saddl2 v6.4s, v6.8h, v16.8h // output 0, lanes 4-7 (v6 becomes an accumulator)
sqrshrun v26.8b, v26.8h, #5 // horizontal half-pel value, 8 bit
smlal v18.4s, v30.4h, v22.4h
smlsl v18.4s, v28.4h, v24.4h
smlal2 v6.4s, v30.8h, v22.8h
smlsl2 v6.4s, v28.8h, v24.8h
uaddl v2.8h, v2.8b, v3.8b
ext v4.8b, v0.8b, v1.8b, #4
mla v20.8h, v2.8h, v22.8h
sqrshrun v18.4h, v18.4s, #10
ext v1.8b, v0.8b, v1.8b, #1
sqrshrun v19.4h, v6.4s, #10
add v28.8h, v10.8h, v16.8h // output 1: R2 + R5
uaddl v2.8h, v1.8b, v4.8b
add v30.8h, v12.8h, v14.8h // output 1: R3 + R4
mls v20.8h, v2.8h, v24.8h // R6 complete
uqxtn v18.8b, v18.8h
uqxtn v19.8b, v19.8h
mov v18.s[1], v19.s[0] // join lanes 0-3 and 4-7 of output 0
ld1 {v0.2s, v1.2s}, [x0], x2 // row 5 load for horizontal filter
urhadd v26.8b, v18.8b, v26.8b // output 0 = rounding average
ext v5.8b, v0.8b, v1.8b, #5
ext v2.8b, v0.8b, v1.8b, #2
st1 {v20.4s}, [x9], x6 // store temp buffer 6 (R6)
saddl v18.4s, v8.4h, v20.4h // output 1, lanes 0-3: R1 + R6
saddl2 v6.4s, v8.8h, v20.8h // output 1, lanes 4-7
ld1 {v8.4s}, [x7], x6 // temp-buffer row to average with output 1
st1 {v26.2s}, [x1], x3 // store row 0
smlal v18.4s, v30.4h, v22.4h
smlsl v18.4s, v28.4h, v24.4h
smlal2 v6.4s, v30.8h, v22.8h
smlsl2 v6.4s, v28.8h, v24.8h
sqrshrun v28.8b, v8.8h, #5 // horizontal half-pel value, 8 bit
ext v3.8b, v0.8b, v1.8b, #3
uaddl v8.8h, v0.8b, v5.8b // R7: a + f (v8 is free again)
uaddl v2.8h, v2.8b, v3.8b
sqrshrun v18.4h, v18.4s, #10
ext v4.8b, v0.8b, v1.8b, #4
sqrshrun v19.4h, v6.4s, #10
mla v8.8h, v2.8h, v22.8h
ext v1.8b, v0.8b, v1.8b, #1
add v26.8h, v12.8h, v20.8h // output 2: R3 + R6
uaddl v2.8h, v1.8b, v4.8b
uqxtn v18.8b, v18.8h
uqxtn v19.8b, v19.8h
mov v18.s[1], v19.s[0]
add v30.8h, v14.8h, v16.8h // output 2: R4 + R5
mls v8.8h, v2.8h, v24.8h // R7 complete
ld1 {v0.2s, v1.2s}, [x0], x2 // row 6 load for horizontal filter
urhadd v28.8b, v28.8b, v18.8b // output 1 = rounding average
ext v5.8b, v0.8b, v1.8b, #5
ext v2.8b, v0.8b, v1.8b, #2
ext v3.8b, v0.8b, v1.8b, #3
st1 {v28.2s}, [x1], x3 // store row 1
uaddl v28.8h, v0.8b, v5.8b // R8: a + f
st1 {v8.4s}, [x9], x6 // store temp buffer 7 (R7)
saddl v18.4s, v10.4h, v8.4h // output 2, lanes 0-3: R2 + R7
saddl2 v6.4s, v10.8h, v8.8h // output 2, lanes 4-7
ld1 {v10.4s}, [x7], x6 // temp-buffer row to average with output 2
smlal v18.4s, v30.4h, v22.4h
smlsl v18.4s, v26.4h, v24.4h
smlal2 v6.4s, v30.8h, v22.8h
smlsl2 v6.4s, v26.8h, v24.8h
sqrshrun v26.8b, v10.8h, #5 // horizontal half-pel value, 8 bit
uaddl v2.8h, v2.8b, v3.8b
ext v4.8b, v0.8b, v1.8b, #4
mla v28.8h, v2.8h, v22.8h
sqrshrun v18.4h, v18.4s, #10
ext v1.8b, v0.8b, v1.8b, #1
sqrshrun v19.4h, v6.4s, #10
add v10.8h, v14.8h, v8.8h // output 3: R4 + R7
uaddl v2.8h, v1.8b, v4.8b
add v30.8h, v16.8h, v20.8h // output 3: R5 + R6
mls v28.8h, v2.8h, v24.8h // R8 complete
uqxtn v27.8b, v18.8h
uqxtn v19.8b, v19.8h
mov v27.s[1], v19.s[0]
saddl v18.4s, v12.4h, v28.4h // output 3, lanes 0-3: R3 + R8
saddl2 v6.4s, v12.8h, v28.8h // output 3, lanes 4-7
urhadd v26.8b, v26.8b, v27.8b // output 2 = rounding average
smlal v18.4s, v30.4h, v22.4h
smlsl v18.4s, v10.4h, v24.4h
smlal2 v6.4s, v30.8h, v22.8h
smlsl2 v6.4s, v10.8h, v24.8h
st1 {v26.2s}, [x1], x3 // store row 2
st1 {v28.4s}, [x9] // store R8 without advancing x9; the next pass stores the same row again and advances
sqrshrun v18.4h, v18.4s, #10
mov v10.16b, v20.16b // window rotate: R6 -> v10
ld1 {v30.4s}, [x7], x6 // temp-buffer row to average with output 3
sqrshrun v19.4h, v6.4s, #10
subs x4, x4, #4 // four rows done; flags are consumed by the bgt below
sqrshrun v30.8b, v30.8h, #5 // horizontal half-pel value, 8 bit
uqxtn v18.8b, v18.8h
uqxtn v19.8b, v19.8h
mov v18.s[1], v19.s[0]
mov v12.16b, v8.16b // window rotate: R7 -> v12
mov v6.16b, v14.16b // window rotate: R4 -> v6
urhadd v30.8b, v18.8b, v30.8b // output 3 = rounding average
mov v8.16b, v16.16b // window rotate: R5 -> v8
mov v14.16b, v28.16b // window rotate: R8 -> v14
st1 {v30.2s}, [x1], x3 // store row 3
bgt loop_8 // more rows left
b end_func
loop_4_start:
// Priming stage for the 4-wide path (reached when wd == 4).
// Same horizontal six-tap filter as the wider paths, but the multiply-
// accumulate steps operate on four 16-bit lanes (.4h) and each temp-buffer
// row is stored as 8 bytes. Note that this path keeps the coefficient 5 in
// v23 (the wider paths use v24); v24 is used as scratch in loop_4.
//     acc = (a + f) + 20 * (c + d) - 5 * (b + e)
// Result: five filtered rows in v6, v8, v10, v12, v14 (oldest first), the
// first four of them also stored to the temp buffer through x9.
movi v22.8h, #20 // filter coefficient 20 in every 16-bit lane of v22
movi v23.8h, #5 // filter coefficient 5 in every 16-bit lane of v23
ld1 {v0.2s, v1.2s}, [x0], x2 //row -2 load
ext v5.8b, v0.8b , v1.8b , #5 // tap f
uaddl v6.8h, v0.8b, v5.8b // v6 = a + f, widened to 16 bit
ext v2.8b, v0.8b , v1.8b , #2 // tap c
ext v3.8b, v0.8b , v1.8b , #3 // tap d
uaddl v8.8h, v2.8b, v3.8b // c + d
ext v4.8b, v0.8b , v1.8b , #4 // tap e
mla v6.4h, v8.4h , v22.4h // v6 += 20 * (c + d), low four lanes only
ext v1.8b, v0.8b , v1.8b , #1 // tap b
uaddl v8.8h, v1.8b, v4.8b // b + e
ld1 {v0.2s, v1.2s}, [x0], x2 // row -1 load
mls v6.4h, v8.4h , v23.4h // v6 -= 5 * (b + e): row -2 complete
ext v5.8b, v0.8b , v1.8b , #5
uaddl v8.8h, v0.8b, v5.8b
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v10.8h, v2.8b, v3.8b
st1 {v6.2s}, [x9], x6 // store temp buffer 0
ext v4.8b, v0.8b , v1.8b , #4
mla v8.4h, v10.4h , v22.4h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v10.8h, v1.8b, v4.8b
ld1 {v0.2s, v1.2s}, [x0], x2 // row 0 load
mls v8.4h, v10.4h , v23.4h // row -1 complete in v8
ext v5.8b, v0.8b , v1.8b , #5
uaddl v10.8h, v0.8b, v5.8b
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v12.8h, v2.8b, v3.8b
st1 {v8.2s}, [x9], x6 // store temp buffer 1
ext v4.8b, v0.8b , v1.8b , #4
mla v10.4h, v12.4h , v22.4h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v12.8h, v1.8b, v4.8b
ld1 {v0.2s, v1.2s}, [x0], x2 // row 1 load
mls v10.4h, v12.4h , v23.4h // row 0 complete in v10
ext v5.8b, v0.8b , v1.8b , #5
uaddl v12.8h, v0.8b, v5.8b
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v14.8h, v2.8b, v3.8b
st1 {v10.2s}, [x9], x6 // store temp buffer 2
ext v4.8b, v0.8b , v1.8b , #4
mla v12.4h, v14.4h , v22.4h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v14.8h, v1.8b, v4.8b
ld1 {v0.2s, v1.2s}, [x0], x2 // row 2 load
mls v12.4h, v14.4h , v23.4h // row 1 complete in v12
ext v5.8b, v0.8b , v1.8b , #5
uaddl v14.8h, v0.8b, v5.8b
ext v2.8b, v0.8b , v1.8b , #2
ext v3.8b, v0.8b , v1.8b , #3
uaddl v16.8h, v2.8b, v3.8b
ext v4.8b, v0.8b , v1.8b , #4
mla v14.4h, v16.4h , v22.4h
ext v1.8b, v0.8b , v1.8b , #1
uaddl v16.8h, v1.8b, v4.8b
st1 {v12.2s}, [x9], x6 // store temp buffer 3
mls v14.4h, v16.4h , v23.4h // row 2 complete in v14 (stored at the top of loop_4)
loop_4:
// Steady-state loop for the 4-wide path: produces 4 output rows per pass.
//
// On entry v6, v8, v10, v12, v14 hold five consecutive horizontally
// filtered rows R0..R4 (four 16-bit lanes each). Each pass filters four
// more source rows (R5 -> v16, R6 -> v26, R7 -> v28, R8 -> v30), stores
// them to the temp buffer through x9, and for each output row k = 0..3:
//   1. applies the six-tap filter vertically in 32 bit (v0):
//          (Rk + Rk+5) + 20 * (Rk+2 + Rk+3) - 5 * (Rk+1 + Rk+4)
//      with v2 = Rk+2 + Rk+3 and v3 = Rk+1 + Rk+4, then rounds/shifts by
//      10 and saturates to 8 bit,
//   2. reloads horizontally filtered rows from the temp buffer through x7,
//      rounds/shifts them by 5 and saturates to 8 bit,
//   3. averages the two with rounding (urhadd). Rows are handled in pairs:
//      two 4-byte rows are packed into one 64-bit register before the
//      average and stored lane by lane to [x1].
// v22 holds 20 and v23 holds 5; v24 and v25 are scratch here.
// At the bottom the window is rotated so that v6..v14 hold R4..R8.
//
// Change relative to the previous revision of this loop: the two trn2
// instructions that followed each trn1 were dropped; their results (v13
// and v5) were never read.
ld1 {v0.2s, v1.2s}, [x0], x2 // row 3 load
ext v5.8b, v0.8b, v1.8b, #5
uaddl v16.8h, v0.8b, v5.8b // R5: a + f
ext v2.8b, v0.8b, v1.8b, #2
ext v3.8b, v0.8b, v1.8b, #3
uaddl v18.8h, v2.8b, v3.8b
st1 {v14.2s}, [x9], x6 // store temp buffer 4 (R4)
ext v4.8b, v0.8b, v1.8b, #4
mla v16.4h, v18.4h, v22.4h // R5 += 20 * (c + d)
ext v1.8b, v0.8b, v1.8b, #1
uaddl v18.8h, v1.8b, v4.8b
add v2.4h, v10.4h, v12.4h // output 0: R2 + R3
mls v16.4h, v18.4h, v23.4h // R5 -= 5 * (b + e)
add v3.4h, v8.4h, v14.4h // output 0: R1 + R4
ld1 {v18.2s, v19.2s}, [x0], x2 // row 4 load
ext v25.8b, v18.8b, v19.8b, #5
uaddl v26.8h, v18.8b, v25.8b // R6: a + f
ext v20.8b, v18.8b, v19.8b, #2
st1 {v16.2s}, [x9], x6 // store temp buffer 5 (R5)
saddl v0.4s, v6.4h, v16.4h // output 0: R0 + R5
smlal v0.4s, v2.4h, v22.4h
ext v21.8b, v18.8b, v19.8b, #3
uaddl v28.8h, v20.8b, v21.8b
ext v24.8b, v18.8b, v19.8b, #4
smlsl v0.4s, v3.4h, v23.4h
mla v26.4h, v28.4h, v22.4h
ext v19.8b, v18.8b, v19.8b, #1
uaddl v28.8h, v19.8b, v24.8b
add v2.4h, v12.4h, v14.4h // output 1: R3 + R4
mls v26.4h, v28.4h, v23.4h // R6 complete
sqrshrun v0.4h, v0.4s, #0xa
add v3.4h, v10.4h, v16.4h // output 1: R2 + R5
ld1 {v18.2s, v19.2s}, [x0], x2 // row 5 load
ext v25.8b, v18.8b, v19.8b, #5
uqxtn v11.8b, v0.8h // output 0 vertical result, 8 bit
uaddl v28.8h, v18.8b, v25.8b // R7: a + f
st1 {v26.2s}, [x9], x6 // store temp buffer 6 (R6)
// R0 (v6) is no longer needed from here on; v6/v7 are reused for the
// temp-buffer rows that get averaged with outputs 0 and 1.
ld1 {v6.2s}, [x7], x6 // temp-buffer row for output 0
ld1 {v7.2s}, [x7], x6 // temp-buffer row for output 1
sqrshrun v9.8b, v6.8h, #5
sqrshrun v7.8b, v7.8h, #5
mov v9.s[1], v7.s[0] // v9 = {half-pel row for output 0, for output 1}
ext v20.8b, v18.8b, v19.8b, #2
saddl v0.4s, v8.4h, v26.4h // output 1: R1 + R6
smlal v0.4s, v2.4h, v22.4h
ext v21.8b, v18.8b, v19.8b, #3
uaddl v6.8h, v20.8b, v21.8b
ext v24.8b, v18.8b, v19.8b, #4
smlsl v0.4s, v3.4h, v23.4h
mla v28.4h, v6.4h, v22.4h
ext v19.8b, v18.8b, v19.8b, #1
uaddl v6.8h, v19.8b, v24.8b
add v2.4h, v14.4h, v16.4h // output 2: R4 + R5
mls v28.4h, v6.4h, v23.4h // R7 complete
sqrshrun v0.4h, v0.4s, #0xa
add v3.4h, v12.4h, v26.4h // output 2: R3 + R6
ld1 {v18.2s, v19.2s}, [x0], x2 // row 6 load
ext v25.8b, v18.8b, v19.8b, #5
uqxtn v13.8b, v0.8h // output 1 vertical result, 8 bit
trn1 v11.2s, v11.2s, v13.2s // v11 = {output 0, output 1}
saddl v0.4s, v10.4h, v28.4h // output 2: R2 + R7
urhadd v9.8b, v9.8b, v11.8b // outputs 0 and 1 = rounding average
st1 {v28.2s}, [x9], x6 // store temp buffer 7 (R7)
smlal v0.4s, v2.4h, v22.4h
uaddl v30.8h, v18.8b, v25.8b // R8: a + f
st1 {v9.s}[0], [x1], x3 // store row 0
ext v20.8b, v18.8b, v19.8b, #2
st1 {v9.s}[1], [x1], x3 // store row 1
ext v21.8b, v18.8b, v19.8b, #3
smlsl v0.4s, v3.4h, v23.4h
uaddl v8.8h, v20.8b, v21.8b
ext v24.8b, v18.8b, v19.8b, #4
mla v30.4h, v8.4h, v22.4h
ext v19.8b, v18.8b, v19.8b, #1
uaddl v8.8h, v19.8b, v24.8b
sqrshrun v0.4h, v0.4s, #0xa
add v2.4h, v16.4h, v26.4h // output 3: R5 + R6
mls v30.4h, v8.4h, v23.4h // R8 complete
uqxtn v4.8b, v0.8h // output 2 vertical result, 8 bit
add v3.4h, v14.4h, v28.4h // output 3: R4 + R7
saddl v0.4s, v12.4h, v30.4h // output 3: R3 + R8
st1 {v30.2s}, [x9] // store R8 without advancing x9; the next pass stores the same row again and advances
smlal v0.4s, v2.4h, v22.4h
ld1 {v8.2s}, [x7], x6 // temp-buffer row for output 2
ld1 {v9.2s}, [x7], x6 // temp-buffer row for output 3
smlsl v0.4s, v3.4h, v23.4h
subs x4, x4, #4 // four rows done; flags are consumed by the bgt below
sqrshrun v10.8b, v8.8h, #5
sqrshrun v9.8b, v9.8h, #5
mov v10.s[1], v9.s[0] // v10 = {half-pel row for output 2, for output 3}
mov v12.8b, v28.8b // window rotate: R7 -> v12
sqrshrun v0.4h, v0.4s, #0xa
mov v6.8b, v14.8b // window rotate: R4 -> v6
mov v8.8b, v16.8b // window rotate: R5 -> v8
uqxtn v5.8b, v0.8h // output 3 vertical result, 8 bit
trn1 v4.2s, v4.2s, v5.2s // v4 = {output 2, output 3}
urhadd v4.8b, v4.8b, v10.8b // outputs 2 and 3 = rounding average
mov v10.8b, v26.8b // window rotate: R6 -> v10
mov v14.8b, v30.8b // window rotate: R8 -> v14
st1 {v4.s}[0], [x1], x3 // store row 2
st1 {v4.s}[1], [x1], x3 // store row 3
bgt loop_4 // more rows left
end_func:
// Common exit for all three width paths: undo the two saves made at
// function entry, in reverse order, and return to the caller.
//Restoring registers from stack
ldp x19, x20, [sp], #16 // pairs with the stp x19, x20 at function entry
pop_v_regs // macro from ih264_neon_macros.s; presumably the inverse of push_v_regs - confirm
ret