// blob: c1847b5200d7c7e6f593b75d9e46156ec34917cc [file] [log] [blame]
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//* ih264_intra_pred_luma_16x16_av8.s
//*
//* @brief
//* Contains function definitions for intra 16x16 luma prediction.
//*
//* @author
//* Ittiam
//*
//* @par List of Functions:
//*
//* - ih264_intra_pred_luma_16x16_mode_vert_av8()
//* - ih264_intra_pred_luma_16x16_mode_horz_av8()
//* - ih264_intra_pred_luma_16x16_mode_dc_av8()
//* - ih264_intra_pred_luma_16x16_mode_plane_av8()
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
///* All the functions here are replicated from ih264_intra_pred_filters.c
//
///**
///**
///**
//
.text
.p2align 2
.include "ih264_neon_macros.s"
.extern ih264_gai1_intrapred_luma_plane_coeffs
///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_16x16_mode_vert
//*
//* @brief
//* Perform Intra prediction for luma_16x16 mode:vertical
//*
//* @par Description:
//* Perform Intra prediction for luma_16x16 mode:Vertical ,described in sec 8.3.3.1
//*
//* @param[in] pu1_src
//* UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//* UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//* integer source stride
//*
//* @param[in] dst_strd
//* integer destination stride
//*
//* @param[in] ui_neighboravailability
//* availability of neighbouring pixels(Not used in this function)
//*
//* @returns
//*
//* @remarks
//* None
//*
//*******************************************************************************
//void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src,
// UWORD8 *pu1_dst,
// WORD32 src_strd,
// WORD32 dst_strd,
// WORD32 ui_neighboravailability)
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
// x2 => src_strd
// x3 => dst_strd
// x4 => ui_neighboravailability
.global ih264_intra_pred_luma_16x16_mode_vert_av8
ih264_intra_pred_luma_16x16_mode_vert_av8:
    // Vertical prediction: the 16 bytes at pu1_src + 17 are copied unchanged
    // into each of the 16 destination rows (rows are dst_strd bytes apart).
    // x2 (src_strd) and x4 (ui_neighboravailability) are not read.
    // Modifies x0, x1, x3, v0, v1; the v-register save/restore is delegated
    // to the push_v_regs / pop_v_regs macros from ih264_neon_macros.s.
    push_v_regs

    // dst_strd is a 32-bit WORD32. AAPCS64 leaves bits [63:32] of the
    // argument register unspecified, so widen it before it is used as a
    // 64-bit post-increment.
    sxtw    x3, w3

    add     x0, x0, #17                 // x0 = pu1_src + 17
    ld1     {v0.8b, v1.8b}, [x0]        // v0:v1 = 16 source bytes

    .rept   16                          // one store per output row
    st1     {v0.8b, v1.8b}, [x1], x3
    .endr

    pop_v_regs
    ret
///******************************************************************************
///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_16x16_mode_horz
//*
//* @brief
//* Perform Intra prediction for luma_16x16 mode:horizontal
//*
//* @par Description:
//* Perform Intra prediction for luma_16x16 mode:horizontal ,described in sec 8.3.3.2
//*
//* @param[in] pu1_src
//* UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//* UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//* integer source stride
//*
//* @param[in] dst_strd
//* integer destination stride
//*
//* @param[in] ui_neighboravailability
//* availability of neighbouring pixels(Not used in this function)
//*
//* @returns
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
//void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
// UWORD8 *pu1_dst,
// WORD32 src_strd,
// WORD32 dst_strd,
// WORD32 ui_neighboravailability)
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
// x2 => src_strd
// x3 => dst_strd
// x4 => ui_neighboravailability
.global ih264_intra_pred_luma_16x16_mode_horz_av8
ih264_intra_pred_luma_16x16_mode_horz_av8:
    // Horizontal prediction: 16 bytes are loaded from pu1_src and output
    // row r (r = 0..15) is filled with byte (15 - r) of that vector, i.e.
    // the bytes are consumed from the highest address downwards.
    // x2 (src_strd) and x4 (ui_neighboravailability) are not read.
    // Modifies x1, x3, v0, v10-v25. v10-v15 overlap the AAPCS64 callee-saved
    // range v8-v15; NOTE(review): this relies on push_v_regs / pop_v_regs
    // (ih264_neon_macros.s) preserving them - confirm against the macro file.
    push_v_regs

    // dst_strd is a 32-bit WORD32. AAPCS64 leaves bits [63:32] of the
    // argument register unspecified, so widen it before it is used as a
    // 64-bit post-increment.
    sxtw    x3, w3

    ld1     {v0.16b}, [x0]              // v0 = pu1_src[0..15]

    // Broadcasts run a few registers ahead of the stores so that each st1
    // consumes a value produced several instructions earlier.
    dup     v10.16b, v0.b[15]           // row 0
    dup     v11.16b, v0.b[14]           // row 1
    dup     v12.16b, v0.b[13]           // row 2
    dup     v13.16b, v0.b[12]           // row 3
    st1     {v10.16b}, [x1], x3
    dup     v14.16b, v0.b[11]           // row 4
    st1     {v11.16b}, [x1], x3
    dup     v15.16b, v0.b[10]           // row 5
    st1     {v12.16b}, [x1], x3
    dup     v16.16b, v0.b[9]            // row 6
    st1     {v13.16b}, [x1], x3
    dup     v17.16b, v0.b[8]            // row 7
    st1     {v14.16b}, [x1], x3
    dup     v18.16b, v0.b[7]            // row 8
    st1     {v15.16b}, [x1], x3
    dup     v19.16b, v0.b[6]            // row 9
    st1     {v16.16b}, [x1], x3
    dup     v20.16b, v0.b[5]            // row 10
    st1     {v17.16b}, [x1], x3
    dup     v21.16b, v0.b[4]            // row 11
    st1     {v18.16b}, [x1], x3
    dup     v22.16b, v0.b[3]            // row 12
    st1     {v19.16b}, [x1], x3
    dup     v23.16b, v0.b[2]            // row 13
    st1     {v20.16b}, [x1], x3
    dup     v24.16b, v0.b[1]            // row 14
    st1     {v21.16b}, [x1], x3
    dup     v25.16b, v0.b[0]            // row 15
    st1     {v22.16b}, [x1], x3
    st1     {v23.16b}, [x1], x3
    st1     {v24.16b}, [x1], x3
    st1     {v25.16b}, [x1], x3

    pop_v_regs
    ret
///******************************************************************************
///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_16x16_mode_dc
//*
//* @brief
//* Perform Intra prediction for luma_16x16 mode:DC
//*
//* @par Description:
//* Perform Intra prediction for luma_16x16 mode:DC ,described in sec 8.3.3.3
//*
//* @param[in] pu1_src
//* UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//* UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//* integer source stride
//*
//* @param[in] dst_strd
//* integer destination stride
//*
//* @param[in] ui_neighboravailability
//* availability of neighbouring pixels
//*
//* @returns
//*
//* @remarks
//* None
//*
//*******************************************************************************/
//void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
// UWORD8 *pu1_dst,
// WORD32 src_strd,
// WORD32 dst_strd,
// WORD32 ui_neighboravailability)
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
// x2 => src_strd
// x3 => dst_strd
// x4 => ui_neighboravailability
.global ih264_intra_pred_luma_16x16_mode_dc_av8
ih264_intra_pred_luma_16x16_mode_dc_av8:
    // DC prediction: every pixel of the 16x16 block gets one value.
    //   bit 0 of ui_neighboravailability set -> the 16 bytes at pu1_src
    //                                           take part in the sum
    //   bit 2 of ui_neighboravailability set -> the 16 bytes at pu1_src + 17
    //                                           take part in the sum
    //   one group used  : dc = (sum +  8) >> 4
    //   both groups used: dc = (sum + 16) >> 5
    //   neither used    : dc = 128
    // Only bits 0 and 2 of the 32-bit argument are examined; other bits and
    // the unspecified upper half of x4 no longer influence the result.
    // x2 (src_strd) is not read.
    // Modifies x1, x3, x6, w10, w11, v0-v3, v10, v20, flags untouched.
    // v10 lies in the AAPCS64 callee-saved range; NOTE(review): this relies
    // on push_v_regs / pop_v_regs (ih264_neon_macros.s) preserving it -
    // confirm against the macro file.
    push_v_regs

    // dst_strd is a 32-bit WORD32. AAPCS64 leaves bits [63:32] of the
    // argument register unspecified, so widen it before it is used as a
    // 64-bit post-increment.
    sxtw    x3, w3

    movi    v0.16b, #0                  // first group, stays 0 when unused
    movi    v1.16b, #0                  // second group, stays 0 when unused
    mov     w10, #0                     // rounding term: +8 per group used
    mov     w11, #3                     // shift amount:  +1 per group used

    tbz     w4, #0, top_available       // bit 0 clear: skip the pu1_src group
    ld1     {v0.16b}, [x0]
    add     w10, w10, #8
    add     w11, w11, #1
top_available:
    tbz     w4, #2, none_available      // bit 2 clear: skip the pu1_src+17 group
    add     x6, x0, #17
    ld1     {v1.16b}, [x6]
    add     w10, w10, #8
    add     w11, w11, #1
    b       summation
none_available:
    // The second group was not used. w10 is non-zero exactly when the first
    // group was loaded; if it is still zero nothing was loaded and the
    // prediction is the mid-grey constant.
    cbnz    w10, summation
    movi    v20.16b, #128
    b       store
summation:
    uaddl   v2.8h, v0.8b, v1.8b         // widen and add the low  8 byte pairs
    uaddl2  v3.8h, v0.16b, v1.16b       // widen and add the high 8 byte pairs
    dup     v10.8h, w10                 // rounding term in every lane
    neg     w11, w11
    dup     v20.8h, w11                 // negative count => uqshl shifts right
    add     v0.8h, v2.8h, v3.8h
    mov     v1.d[0], v0.d[1]
    add     v0.4h, v0.4h, v1.4h         // 8 partial sums -> 4
    addp    v0.4h, v0.4h , v0.4h        // 4 -> 2
    addp    v0.4h, v0.4h , v0.4h        // 2 -> 1, total now in lane 0
    add     v0.4h, v0.4h, v10.4h        // + rounding
    uqshl   v0.8h, v0.8h, v20.8h        // >> (3 + number of groups used)
    sqxtun  v0.8b, v0.8h                // narrow to bytes
    dup     v20.16b, v0.b[0]            // broadcast the DC value
store:
    .rept   16                          // one store per output row
    st1     { v20.16b}, [x1], x3
    .endr
end_func:
    pop_v_regs
    ret
///******************************************************************************
///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_16x16_mode_plane
//*
//* @brief
//* Perform Intra prediction for luma_16x16 mode:PLANE
//*
//* @par Description:
//* Perform Intra prediction for luma_16x16 mode:PLANE ,described in sec 8.3.3.4
//*
//* @param[in] pu1_src
//* UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//* UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//* integer source stride
//*
//* @param[in] dst_strd
//* integer destination stride
//*
//* @param[in] ui_neighboravailability
//* availability of neighbouring pixels (not used in this function)
//*
//* @returns
//*
//* @remarks
//* None
//*
//*******************************************************************************/
//void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src,
// UWORD8 *pu1_dst,
// WORD32 src_strd,
// WORD32 dst_strd,
// WORD32 ui_neighboravailability)
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
// x2 => src_strd
// x3 => dst_strd
// x4 => ui_neighboravailability
.global ih264_intra_pred_luma_16x16_mode_plane_av8
ih264_intra_pred_luma_16x16_mode_plane_av8:
    // Plane prediction. With s[n] = byte at pu1_src + n and k[i] = i-th byte
    // of ih264_gai1_intrapred_luma_plane_coeffs (16 bytes are read):
    //   H   = sum(i = 0..7) k[i] * (s[25 + i] - s[23 - i])
    //   V   = sum(j = 1..8) j    * (s[8 - j]  - s[8 + j])
    //   i_b = (5 * H + 32) >> 6
    //   i_c = (5 * V + 32) >> 6
    //   i_a = 16 * (s[0] + s[32])
    //   row y, column x = sat_u8((i_a - 8*i_b - 7*i_c + y*i_c
    //                             + k[x]*i_b + 16) >> 5)
    // NOTE(review): the coefficient table is defined in another file; it is
    // presumably 1..16, which makes k[x]*i_b - 8*i_b equal (x - 7)*i_b -
    // confirm against its definition.
    // x2 (src_strd) and x4 (ui_neighboravailability) are overwritten before
    // being read, so neither argument influences the result.
    // Modifies x0-x10, x12, x20 (saved/restored here), v0, v2, v4, v6, v7,
    // v16, v18, v20-v23, v26, v28, v30.
    push_v_regs
    stp     x19, x20, [sp, #-16]!       // x20 is used as scratch below

    // dst_strd is a 32-bit WORD32. AAPCS64 leaves bits [63:32] of the
    // argument register unspecified, so widen it before it is used as a
    // 64-bit post-increment.
    sxtw    x3, w3

    mov     x2, x1                      // x2 = pu1_dst (x1 becomes a source pointer)
    add     x1, x0, #17
    add     x0, x0, #15
    mov     x8, #9
    sub     x1, x1, #1                  // x1 = pu1_src + 16
    mov     x10, x1                     //top_left
    mov     x4, #-1                     // used only as a -1 multiplier below
    ld1     {v2.2s}, [x1], x8           // v2 = s[16..23]; x1 = pu1_src + 25
    adrp    x7, :got:ih264_gai1_intrapred_luma_plane_coeffs
    ldr     x7, [x7, #:got_lo12:ih264_gai1_intrapred_luma_plane_coeffs]
    ld1     {v0.2s}, [x1]               // v0 = s[25..32]
    rev64   v2.8b, v2.8b                // v2 = s[23], s[22], ..., s[16]
    ld1     {v6.2s, v7.2s}, [x7]        // v6 = k[0..7], v7 = k[8..15]
    usubl   v0.8h, v0.8b, v2.8b         // lane i = s[25 + i] - s[23 - i]
    uxtl    v16.8h, v6.8b               // k[0..7]  as 16-bit lanes
    mul     v0.8h, v0.8h , v16.8h
    uxtl    v18.8h, v7.8b               // k[8..15] as 16-bit lanes
    add     x7, x0, x4, lsl #3          // x7 = pu1_src + 7, walks downwards
    sub     x0, x7, x4, lsl #1          // x0 = pu1_src + 9, walks upwards
    addp    v0.8h, v0.8h, v1.8h         // only the low 4 lanes (from v0) are used below

    // The scalar V accumulation (x12) is interleaved with the vector H
    // reduction. ldrb already zero-extends, so the sxtw that follows each
    // load leaves the value unchanged.
    ldrb    w8, [x7], #-1               // s[7]
    sxtw    x8, w8
    ldrb    w9, [x0], #1                // s[9]
    sxtw    x9, w9
    saddlp  v0.2s, v0.4h
    sub     x12, x8, x9                 // V  = 1 * (s[7] - s[9])
    ldrb    w8, [x7], #-1               // s[6]
    sxtw    x8, w8
    saddlp  v0.1d, v0.2s                // H in the low lane
    ldrb    w9, [x0], #1                // s[10]
    sxtw    x9, w9
    sub     x8, x8, x9
    shl     v2.2s, v0.2s, #2
    add     x12, x12, x8, lsl #1        // V += 2 * (s[6] - s[10])
    add     v0.2s, v0.2s , v2.2s        // 5 * H
    ldrb    w8, [x7], #-1               // s[5]
    sxtw    x8, w8
    ldrb    w9, [x0], #1                // s[11]
    sxtw    x9, w9
    srshr   v0.2s, v0.2s, #6            // i_b = D0[0]
    sub     x8, x8, x9
    ldrb    w5, [x7], #-1               // s[4]
    sxtw    x5, w5
    add     x8, x8, x8, lsl #1
    dup     v4.8h, v0.h[0]              // i_b in every lane
    add     x12, x12, x8                // V += 3 * (s[5] - s[11])
    ldrb    w9, [x0], #1                // s[12]
    sxtw    x9, w9
    mul     v0.8h, v4.8h , v16.8h       // i_b * k[0..7]
    sub     x5, x5, x9
    mul     v2.8h, v4.8h , v18.8h       // i_b * k[8..15]
    add     x12, x12, x5, lsl #2        // V += 4 * (s[4] - s[12])
    ldrb    w8, [x7], #-1               // s[3]
    sxtw    x8, w8
    ldrb    w9, [x0], #1                // s[13]
    sxtw    x9, w9
    sub     x8, x8, x9
    ldrb    w5, [x7], #-1               // s[2]
    sxtw    x5, w5
    add     x8, x8, x8, lsl #2
    ldrb    w6, [x0], #1                // s[14]
    sxtw    x6, w6
    add     x12, x12, x8                // V += 5 * (s[3] - s[13])
    ldrb    w8, [x7], #-1               // s[1]
    sxtw    x8, w8
    ldrb    w9, [x0], #1                // s[15]
    sxtw    x9, w9
    sub     x5, x5, x6
    sub     x8, x8, x9
    add     x5, x5, x5, lsl #1
    sub     x20, x8, x8, lsl #3         // -7 * (s[1] - s[15])
    neg     x8, x20                     //  7 * (s[1] - s[15])
    add     x12, x12, x5, lsl #1        // V += 6 * (s[2] - s[14])
    ldrb    w5, [x7], #-1               // s[0]
    sxtw    x5, w5
    ldrb    w6, [x10]                   //top_left
    sxtw    x6, w6
    add     x12, x12, x8                // V += 7 * (s[1] - s[15])
    sub     x9, x5, x6
    ldrb    w6, [x1, #7]                // s[32]
    sxtw    x6, w6
    add     x12, x12, x9, lsl #3        // i_c = x12
    add     x8, x5, x6
    add     x12, x12, x12, lsl #2       // 5 * V
    lsl     x8, x8, #4                  // i_a = x8
    add     x12, x12, #0x20
    // Only the low 16 bits are consumed by the dup below, so this logical
    // shift yields the same lanes an arithmetic shift would.
    lsr     x12, x12, #6

    shl     v28.8h, v4.8h, #3           // 8 * i_b
    dup     v6.8h, w12                  // i_c in every lane
    dup     v30.8h, w8                  // i_a in every lane
    shl     v26.8h, v6.8h, #3           // 8 * i_c
    sub     v30.8h, v30.8h , v28.8h
    sub     v30.8h, v30.8h , v26.8h     // i_a - 8*i_b - 8*i_c
    add     v28.8h, v30.8h , v6.8h      // i_a - 8*i_b - 7*i_c
    add     v26.8h, v28.8h , v0.8h      // row 0, columns 0..7  (before shift)
    add     v28.8h, v28.8h , v2.8h      // row 0, columns 8..15 (before shift)

    // Rows are produced two at a time in alternating register pairs
    // (v20:v21 and v22:v23) so each store overlaps the next row's narrowing.
    // Stepping to the next row adds i_c to both accumulators.
    sqrshrun v20.8b, v26.8h, #5
    sqrshrun v21.8b, v28.8h, #5
    add     v26.8h, v26.8h , v6.8h
    add     v28.8h, v28.8h , v6.8h
    sqrshrun v22.8b, v26.8h, #5
    st1     {v20.2s, v21.2s}, [x2], x3  // row 0
    sqrshrun v23.8b, v28.8h, #5

    .rept   7                           // rows 1..14, two per iteration
    add     v26.8h, v26.8h , v6.8h
    add     v28.8h, v28.8h , v6.8h
    sqrshrun v20.8b, v26.8h, #5
    st1     {v22.2s, v23.2s}, [x2], x3  // odd row
    sqrshrun v21.8b, v28.8h, #5
    add     v26.8h, v26.8h , v6.8h
    add     v28.8h, v28.8h , v6.8h
    sqrshrun v22.8b, v26.8h, #5
    st1     {v20.2s, v21.2s}, [x2], x3  // even row
    sqrshrun v23.8b, v28.8h, #5
    .endr

    st1     {v22.2s, v23.2s}, [x2], x3  // row 15
end_func_plane:
    ldp     x19, x20, [sp], #16
    pop_v_regs
    ret