blob: fc86ffaa6ff6955595fff36b8419300a18b61486 [file] [log] [blame]
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//* ihevc_intra_pred_filters_dc.s
//*
//* @brief
//* contains function definitions for intra prediction dc filtering.
//* functions are hand-coded in armv8 (aarch64) neon assembly, written in
//* gnu assembler syntax
//*
//* @author
//* akshaya mukund
//*
//* @par list of functions:
//*
//*
//* @remarks
//* none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//* luma intraprediction filter for dc input
//*
//* @par description:
//*
//* @param[in] pu1_ref
//* uword8 pointer to the source
//*
//* @param[out] pu1_dst
//* uword8 pointer to the destination
//*
//* @param[in] src_strd
//* integer source stride
//*
//* @param[in] dst_strd
//* integer destination stride
//*
//* @param[in] pi1_coeff
//* word8 pointer to the planar coefficients
//* (not part of the prototype below and not referenced by the code)
//*
//* @param[in] nt
//* size of transform block
//*
//* @param[in] mode
//* type of filtering
//*
//* @returns
//*
//* @remarks
//* none
//*
//*******************************************************************************
//*/
//void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
// word32 src_strd,
// uword8 *pu1_dst,
// word32 dst_strd,
// word32 nt,
// word32 mode)
//
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd
//x2 => *pu1_dst
//x3 => dst_strd
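//x4 => nt
//x5 => mode (never read; w5 is overwritten and used as scratch)
//note: nothing is read from the stack; pi1_coeff is not an argument of
//      this function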
.text
.align 4
.include "ihevc_neon_macros.s"
.globl ihevc_intra_pred_luma_dc_av8
.type ihevc_intra_pred_luma_dc_av8, %function
//------------------------------------------------------------------------------
// void ihevc_intra_pred_luma_dc_av8(uword8 *pu1_ref, word32 src_strd,
//                                   uword8 *pu1_dst, word32 dst_strd,
//                                   word32 nt,       word32 mode)
//
// In:    x0 = pu1_ref (called "src" below), x2 = pu1_dst,
//        w3 = dst_strd ("strd"), w4 = nt
//        x1 (src_strd) and the incoming x5 (mode) are never read.
//
// What the code computes (all src[] / dst[] accesses are bytes):
//   dc_val    = (sum(src[nt .. 2nt-1]) + sum(src[2nt+1 .. 3nt]) + nt)
//                                                        >> (log2(nt) + 1)
//   nt == 32 : every byte of the 32x32 block is dc_val.
//   otherwise:
//     dst[0][0] = (src[2nt-1] + 2*dc_val + src[2nt+1] + 2) >> 2
//     dst[0][c] = (src[2nt+1+c] + 3*dc_val + 2) >> 2        c = 1 .. nt-1
//     dst[r][0] = (src[2nt-1-r] + 3*dc_val + 2) >> 2        r = 1 .. nt-1
//     all remaining bytes are dc_val.
//   Dedicated paths exist for nt == 4 and nt == 32; the generic path is
//   written for nt == 8 and nt == 16.
//
// Writes: x0, x2-x12, x14, v0-v7, v16-v30, condition flags.
//         x20 is used as scratch and restored on exit; x19 is never written
//         (it is only the partner register of the stp/ldp pair).
//         No call is made and v8-v15 are not touched.
//------------------------------------------------------------------------------
ihevc_intra_pred_luma_dc_av8:
    stp         x19, x20, [sp, #-16]!       // save x20 (scratch below)

    // dst_strd and nt are 32-bit arguments: AAPCS64 leaves bits [63:32] of
    // x3/x4 unspecified, but both are used as 64-bit values below (address
    // arithmetic, post-index increments, cmp, loop counter).
    sxtw        x3, w3                      // x3 = (int64) dst_strd
    sxtw        x4, w4                      // x4 = (int64) nt

    mov         x11, #2                     // rounding constant 2
    mov         x9, #0
    mov         v17.s[0], w11
    mov         v17.s[1], w9                // d17 = 2 (added to 3*dc_val later)

    clz         w5, w4                      // w5 = clz(nt)

    add         x6, x0, x4                  // x6 = &src[nt]
    sub         x20, x5, #32                // x20 = clz(nt) - 32 = -(log2(nt) + 1)
    neg         x5, x20                     // x5 = log2(nt) + 1
    add         x7, x0, x4, lsl #1          // x7 = &src[2nt]
    add         x8, x7, #1                  // x8 = &src[2nt+1]
    mvn         x5, x5
    add         x5, x5, #1                  // x5 = -(log2(nt) + 1)  (~x + 1)
    dup         v7.2s, w5
    ldrb        w14, [x8]                   // w14 = src[2nt+1]
    sxtw        x14, w14
    shl         d7, d7, #32
    sub         x9, x7, #1                  // x9 = &src[2nt-1]
    sshr        d7, d7, #32                 // d7 = -(log2(nt)+1), sign-extended to 64 bits
    mov         x7, x8                      // x7 keeps &src[2nt+1]

    ldrb        w12, [x9]                   // w12 = src[2nt-1]
    sxtw        x12, w12
    add         x14, x14, x12               // src[2nt+1] + src[2nt-1]
    add         x14, x14, x11               // src[2nt+1] + src[2nt-1] + 2

    cmp         x4, #4
    beq         dc_4                        // nt == 4 has its own path

    mov         x10, x4                     // x10 = bytes still to be summed

    // Sum nt bytes starting at src[nt] and nt bytes starting at src[2nt+1],
    // 8 + 8 bytes per step. Each step also pre-loads the next 8 + 8 bytes;
    // the last pre-load is never consumed.
add_loop:
    ld1         {v0.8b}, [x6], #8           // src[nt .. nt+7]
    mov         x5, #0
    ld1         {v1.8b}, [x8], #8           // src[2nt+1 .. 2nt+8]

    uaddlp      v2.4h, v0.8b                // pairwise byte sums -> 4 x u16
    mov         v6.s[0], w4
    mov         v6.s[1], w5                 // d6 = nt (rounding term of the dc sum)

    uaddlp      v3.4h, v1.8b

    ld1         {v0.8b}, [x6], #8           // pre-load next 8 bytes (first range)
    ld1         {v1.8b}, [x8], #8           // pre-load next 8 bytes (second range)

    add         v4.4h, v2.4h, v3.4h
    uaddlp      v5.2s, v4.4h
    uadalp      v6.1d, v5.2s                // d6 += sum of these 16 bytes

    subs        x10, x10, #8
    beq         epil_add_loop               // nt == 8: sum is complete

core_loop_add:
    uaddlp      v2.4h, v0.8b
    subs        x10, x10, #8
    uaddlp      v3.4h, v1.8b

    add         v4.4h, v2.4h, v3.4h
    ld1         {v0.8b}, [x6], #8           // pre-load next 8 bytes (first range)
    uaddlp      v5.2s, v4.4h
    ld1         {v1.8b}, [x8], #8           // pre-load next 8 bytes (second range)
    uadalp      v6.1d, v5.2s                // d6 += sum of these 16 bytes

    bne         core_loop_add

epil_add_loop:
    sshl        d18, d6, d7                 // d18 = dc_val = d6 >> (log2(nt)+1)
                                            // (sshl with a negative count shifts right)
    cmp         x4, #32                     // flags consumed by csel/beq below
    mov         v28.s[0], w14
    mov         v28.s[1], w5                // d28 = src[2nt+1] + src[2nt-1] + 2  (w5 == 0)
    mov         x20, #128
    csel        x6, x20, x6, eq             // x6 is reassigned in prologue_cpy_32 before any read
    dup         v16.8b, v18.b[0]            // v16 = dc_val in all 8 byte lanes
    shl         d25, d18, #1                // d25 = 2 * dc_val

    beq         prologue_cpy_32             // nt == 32: plain dc fill

    // From here on nt != 32, so every "ne" csel below takes its first source.
    add         d27, d25, d28               // src[2nt+1] + src[2nt-1] + 2 + 2*dc_val
    mov         x20, #0
    csel        x6, x20, x6, ne             // x6 = 0
    ushr        v29.4h, v27.4h, #2          // v29.h[0] = value of dst[0][0]
    csel        x10, x4, x10, ne            // x10 = nt (columns still to be written)

    add         d23, d25, d18               // 3 * dc_val
    sub         x12, x3, x3, lsl #3         // -7 * strd
    add         d23, d23, d17               // 3 * dc_val + 2
    add         x12, x12, #8                // x12 = -7*strd + 8: from row 7 back to row 0, 8 bytes right
    dup         v24.8h, v23.h[0]            // 3 * dc_val + 2 in all u16 lanes
    sub         x0, x3, x4                  // strd - nt

    // Generic path (nt == 8 or 16). The block is written in 8x8 tiles.
prologue_col:
    mov         x8, x7                      // x8 = &src[2nt+1]
    add         x0, x0, #8                  // x0 = strd - nt + 8
    ld1         {v0.8b}, [x8], #8           // src[2nt+1 .. 2nt+8]   (row 0, cols 0..7)
    sub         x9, x9, #7                  // x9 = &src[2nt-8]

    ld1         {v1.8b}, [x9]               // src[2nt-8 .. 2nt-1]   (col 0, rows 7..0)
    sub         x9, x9, #8                  // x9 = &src[2nt-16]

    uxtl        v20.8h, v0.8b

    ld1         {v6.8b}, [x8]               // src[2nt+9 .. 2nt+16]  (row 0, cols 8..15)
    add         v20.8h, v20.8h, v24.8h      // + (3*dc_val + 2)

    uxtl        v22.8h, v1.8b
    sqshrun     v2.8b, v20.8h, #2           // row 0, cols 0..7 (lane 0 replaced below)

    uxtl        v26.8h, v6.8b
    add         v22.8h, v22.8h, v24.8h      // + (3*dc_val + 2)

    movi        d19, #0x00000000000000ff    // select mask: byte 0 only
    sqshrun     v3.8b, v22.8h, #2           // col 0 values, lane i = row 7-i

    bsl         v19.8b, v29.8b, v2.8b       // byte 0 = dst[0][0], bytes 1..7 from v2
    add         v26.8h, v26.8h, v24.8h      // cols 8..15: + (3*dc_val + 2)

    rev64       v3.8b, v3.8b                // now lane i = row i

    st1         {v19.8b}, [x2], x3          // store row 0
    sshr        d3, d3, #8                  // drop row 0's lane; byte 0 = row 1 value

    movi        d20, #0x00000000000000ff    // byte-0 mask for the next row

    // Each step below: byte 0 <- next col-0 value from d3, bytes 1..7 <-
    // dc_val, store the row, shift d3 down one byte. Row numbers in the
    // comments are for the first pass; on the second pass (nt == 16) the same
    // code writes rows 9..15.
loop_again_col_row:
    bsl         v20.8b, v3.8b, v16.8b       // row 1
    movi        d21, #0x00000000000000ff    // mask for row 2
    sshr        d3, d3, #8

    st1         {v20.8b}, [x2], x3          // store row 1
    sqshrun     v4.8b, v26.8h, #2           // row 0, cols 8..15 (only stored for nt == 16)

    bsl         v21.8b, v3.8b, v16.8b       // row 2
    movi        d20, #0x00000000000000ff    // mask for row 3
    sshr        d3, d3, #8

    st1         {v21.8b}, [x2], x3          // store row 2

    bsl         v20.8b, v3.8b, v16.8b       // row 3
    movi        d21, #0x00000000000000ff    // mask for row 4
    sshr        d3, d3, #8

    st1         {v20.8b}, [x2], x3          // store row 3

    bsl         v21.8b, v3.8b, v16.8b       // row 4
    movi        d20, #0x00000000000000ff    // mask for row 5
    sshr        d3, d3, #8

    st1         {v21.8b}, [x2], x3          // store row 4

    bsl         v20.8b, v3.8b, v16.8b       // row 5
    movi        d21, #0x00000000000000ff    // mask for row 6
    sshr        d3, d3, #8

    st1         {v20.8b}, [x2], x3          // store row 5

    ld1         {v1.8b}, [x9]               // src[2nt-16 .. 2nt-9]  (col 0, rows 15..8)

    bsl         v21.8b, v3.8b, v16.8b       // row 6
    uxtl        v22.8h, v1.8b

    movi        d20, #0x00000000000000ff    // mask for row 7
    sshr        d3, d3, #8

    st1         {v21.8b}, [x2], x3          // store row 6

    bsl         v20.8b, v3.8b, v16.8b       // row 7
    add         v22.8h, v22.8h, v24.8h      // rows 8..15: + (3*dc_val + 2)

    sshr        d3, d3, #8
    st1         {v20.8b}, [x2], x12         // store row 7; x2 -> top of the tile to the right

    subs        x10, x10, #8                // one 8-wide tile column finished
    beq         end_func                    // nt == 8: done
    blt         copy_16                     // nt == 16, second pass: fill the last tile

    // nt == 16, first pass: write the upper-right tile, then start the
    // lower-left tile.
    movi        d20, #0x00000000000000ff    // byte-0 mask for row 8
    sqshrun     v3.8b, v22.8h, #2           // col 0 values, lane i = row 15-i

    rev64       v3.8b, v3.8b                // now lane i = row 8+i

    st1         {v4.8b}, [x2], x3           // row 0, cols 8..15
    st1         {v16.8b}, [x2], x3          // rows 1..7, cols 8..15 = dc_val
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x0          // x0 = strd - 8 here: x2 -> row 8, col 0

    bsl         v20.8b, v3.8b, v16.8b       // row 8
    subs        x10, x10, #8                // x10 = 0; flags are not used before the next subs
    st1         {v20.8b}, [x2], x3          // store row 8
    sshr        d3, d3, #8                  // byte 0 = row 9 value
    movi        d20, #0x00000000000000ff    // byte-0 mask for row 9

    b           loop_again_col_row          // rows 9..15 reuse the loop body

copy_16:                                    // rows 8..15, cols 8..15 = dc_val
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2]

    b           end_func

    // nt == 32: fill the whole 32x32 block with dc_val, four rows at a time
    // (row pointers x2, x5, x8, x10), two 16-byte stores per row.
prologue_cpy_32:
    mov         x9, #128                    // loop counter, decremented by 32 below
    add         x5, x2, x3                  // row 1
    add         x8, x5, x3                  // row 2
    add         x10, x8, x3                 // row 3
    dup         v20.16b, v16.b[0]           // dc_val in all 16 byte lanes
    lsl         x6, x3, #2
    sub         x6, x6, #16                 // x6 = 4*strd - 16: second half -> 4 rows down, col 0

    st1         {v20.16b}, [x2], #16
    st1         {v20.16b}, [x5], #16
    st1         {v20.16b}, [x8], #16
    st1         {v20.16b}, [x10], #16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    sub         x9, x9, #32                 // x9 = 96

kernel_copy:                                // 8 rows per iteration, 3 iterations
    st1         {v20.16b}, [x2], #16
    st1         {v20.16b}, [x5], #16
    st1         {v20.16b}, [x8], #16
    st1         {v20.16b}, [x10], #16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    subs        x9, x9, #32

    st1         {v20.16b}, [x2], #16
    st1         {v20.16b}, [x5], #16
    st1         {v20.16b}, [x8], #16
    st1         {v20.16b}, [x10], #16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    bne         kernel_copy

epilogue_copy:                              // last 4 rows
    st1         {v20.16b}, [x2], #16
    st1         {v20.16b}, [x5], #16
    st1         {v20.16b}, [x8], #16
    st1         {v20.16b}, [x10], #16

    st1         {v20.16b}, [x2]
    st1         {v20.16b}, [x5]
    st1         {v20.16b}, [x8]
    st1         {v20.16b}, [x10]

    b           end_func

    // nt == 4: single 4x4 tile. 8 bytes are loaded from each range but only
    // the first 4 of each contribute to the sum (upper lane masked off below).
dc_4:
    ld1         {v0.8b}, [x6], #8           // src[4 .. 11]
    ld1         {v1.8b}, [x8], #8           // src[9 .. 16]

    uaddlp      v2.4h, v0.8b
    mov         x5, #0
    mov         v6.s[0], w4
    mov         v6.s[1], w5                 // d6 = nt (rounding term of the dc sum)
    uaddlp      v3.4h, v1.8b

    add         v4.4h, v2.4h, v3.4h
    uaddlp      v5.2s, v4.4h                // lane 0 = src[4..7] + src[9..12]
    movi        d30, #0x00000000ffffffff
    and         v5.8b, v5.8b, v30.8b        // keep lane 0 only

    mov         v28.s[0], w14
    mov         v28.s[1], w5                // d28 = src[2nt+1] + src[2nt-1] + 2
    add         d6, d6, d5                  // d6 = nt + sum of the 8 bytes

    sshl        d18, d6, d7                 // d18 = dc_val = d6 >> (log2(nt)+1)
    mov         x8, x7                      // x8 = &src[2nt+1]

    shl         d25, d18, #1                // 2 * dc_val
    sub         x9, x9, #3                  // x9 = &src[2nt-4]

    dup         v16.8b, v18.b[0]            // dc_val in all byte lanes
    add         d27, d25, d28               // src[2nt+1] + src[2nt-1] + 2 + 2*dc_val

    ushr        v29.4h, v27.4h, #2          // v29.h[0] = value of dst[0][0]
    sub         x12, x3, x3, lsl #2         // -3 * strd

    add         d23, d25, d18               // 3 * dc_val
    add         d23, d23, d17               // 3 * dc_val + 2

    add         x12, x12, #4                // x12 = -3*strd + 4 (not read on this path)
    dup         v24.8h, v23.h[0]            // 3 * dc_val + 2 in all u16 lanes
    sub         x0, x3, x4                  // strd - nt (not read on this path)

    ld1         {v0.8b}, [x8]               // src[2nt+1 ..]  (row 0; first 4 lanes used)
    ld1         {v1.8b}, [x9]               // src[2nt-4 .. 2nt+3]  (lanes 0..3 = rows 3..0)

    uxtl        v20.8h, v0.8b
    uxtl        v22.8h, v1.8b

    add         v20.8h, v20.8h, v24.8h      // + (3*dc_val + 2)
    add         v22.8h, v22.8h, v24.8h      // + (3*dc_val + 2)

    movi        d19, #0x00000000000000ff    // select mask: byte 0 only
    sqshrun     v2.8b, v20.8h, #2           // row 0 values (lane 0 replaced below)

    movi        d20, #0x00000000000000ff    // byte-0 mask for row 1
    sqshrun     v3.8b, v22.8h, #2           // col 0 values, lane i = row 3-i (i = 0..3)

    bsl         v19.8b, v29.8b, v2.8b       // byte 0 = dst[0][0], rest from v2

    rev64       v3.8b, v3.8b                // lanes 4..7 = rows 0..3

    st1         {v19.s}[0], [x2], x3        // store row 0 (4 bytes)
    sshr        d3, d3, #40                 // skip 4 unused lanes + row 0; byte 0 = row 1 value

    movi        d21, #0x00000000000000ff    // byte-0 mask for row 2

    bsl         v20.8b, v3.8b, v16.8b       // row 1
    sshr        d3, d3, #8

    st1         {v20.s}[0], [x2], x3        // store row 1

    bsl         v21.8b, v3.8b, v16.8b       // row 2
    movi        d20, #0x00000000000000ff    // byte-0 mask for row 3

    sshr        d3, d3, #8
    st1         {v21.s}[0], [x2], x3        // store row 2

    bsl         v20.8b, v3.8b, v16.8b       // row 3
    st1         {v20.s}[0], [x2]            // store row 3

epilogue_end:
end_func:
    ldp         x19, x20, [sp], #16         // restore x20 (and the untouched x19)
    ret