@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@* ihevc_inter_pred_luma_horz_w16out.s
@*
@* @brief
@* contains function definitions for inter prediction interpolation.
@* functions are coded in neon assembly and can be compiled using
@* rvct
@*
@* @author
@* parthiban v
@*
@* @par list of functions:
@*
@* - ihevc_inter_pred_luma_horz_w16out()
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@* inter prediction luma filter for horizontal 16-bit output
@*
@* @par description:
@* applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
@* to the elements pointed to by 'pu1_src' and writes the result to the
@* location pointed to by 'pi2_dst'. no downshifting or clipping is done,
@* so the output can be used as an input for vertical filtering or
@* weighted prediction.
@*
@* assumptions: the function is optimized assuming the width is a multiple
@* of 4 or 8. if the width is a multiple of 4, the height should be a
@* multiple of 2; the width-8 case is optimized further.
@*
@* @param[in] pu1_src
@* uword8 pointer to the source
@*
@* @param[out] pi2_dst
@* word16 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] pi1_coeff
@* word8 pointer to the filter coefficients
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
@ word16 *pi2_dst,
@ word32 src_strd,
@ word32 dst_strd,
@ word8 *pi1_coeff,
@ word32 ht,
@ word32 wd)
@r0 - free
@r1 - dst_ptr
@r2 - src_strd
@r3 - dst_strd
@r4 - src_ptr2
@r5 - inner loop counter
@r6 - dst_ptr2
@r7 - free
@r8 - dst_strd2
@r9 - src_strd1
@r10 - wd
@r11 - #1
@r12 - src_ptr1
@r14 - loop_counter
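@
@ the block below is a rough c-style sketch of what this routine computes;
@ it is an illustrative reference only (an assumption based on the header
@ description above, not the project's reference c code). it assumes an
@ 8-tap filter whose signed taps are read directly from pi1_coeff:
@
@     for(row = 0; row < ht; row++)
@     {
@         for(col = 0; col < wd; col++)
@         {
@             word32 sum = 0;
@             for(i = 0; i < 8; i++)
@                 sum += pi1_coeff[i] * pu1_src[col + (i - 3)];
@             pi2_dst[col] = (word16)sum; /* no rounding, shift or clip */
@         }
@         pu1_src += src_strd;
@         pi2_dst += dst_strd;
@     }
@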
.equ coeff_offset, 104
.equ ht_offset, 108
.equ wd_offset, 112
.text
.align 4
.syntax unified
.globl ihevc_inter_pred_luma_horz_w16out_a9q
.type ihevc_inter_pred_luma_horz_w16out_a9q, %function
ihevc_inter_pred_luma_horz_w16out_a9q:
bic r14, #1 @ clearing bit[0], so that the return goes back to arm mode
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
vpush {d8 - d15}
ldr r4,[sp,#coeff_offset] @loads pi1_coeff
ldr r7,[sp,#ht_offset] @loads ht
vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
sub r14,r7,#0 @r14 = ht (loop counter)
vabs.s8 d2,d0 @vabs_s8(coeff)
mov r11,#1
ldr r10,[sp,#wd_offset] @loads wd
vdup.8 d24,d2[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
sub r12,r0,#3 @pu1_src - 3
vdup.8 d25,d2[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
add r4,r12,r2 @pu1_src_tmp2_8 = pu1_src + src_strd
vdup.8 d26,d2[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
rsb r9,r10,r2,lsl #1 @2*src_strd - wd
vdup.8 d27,d2[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
rsb r8,r10,r3 @dst_strd - wd
vdup.8 d28,d2[4] @coeffabs_4 = vdup_lane_u8(coeffabs, 4)
vdup.8 d29,d2[5] @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
and r7,r14,#1 @calculating ht_residue ht_residue = (ht & 1)
vdup.8 d30,d2[6] @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
sub r14,r14,r7 @decrement height by ht_residue (residue value is calculated outside)
vdup.8 d31,d2[7] @coeffabs_7 = vdup_lane_u8(coeffabs, 7)
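@ note: vabs above turns the signed taps into magnitudes so that the unsigned
@ vmull/vmlal/vmlsl.u8 forms can be used; taps 0, 2, 5 and 7 (the negative taps
@ of the hevc luma filters) are applied with vmlsl and the rest with vmlal,
@ which restores the correct signs in the 16-bit accumulator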
cmp r7,#1
beq odd_height_decision
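@ width dispatch for the even-height path: wd <= 4 uses the 4-wide loop,
@ wd >= 16 the 16-wide loop and the rest the 8-wide loop; wd == 24 runs a
@ 16-wide pass followed by an 8-wide residual pass, and wd == 12 runs an
@ 8-wide pass followed by a 4-wide residual pass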
even_height_decision:
mov r7,r1
cmp r10,#4
ble outer_loop_4
cmp r10,#24
moveq r10,#16
addeq r8,#8
addeq r9,#8
cmp r10,#16
bge outer_loop_16_branch
cmp r10,#12
addeq r8,#4
addeq r9,#4
outer_loop_8_branch:
b outer_loop_8
outer_loop_16_branch:
b outer_loop_16
odd_height_decision:
cmp r10,#24
beq outer_loop_8_branch
cmp r10,#12
beq outer_loop_4
b even_height_decision
outer_loop4_residual:
sub r12,r0,#3 @pu1_src - 3
mov r1,r7
add r1,#16
mov r10,#4
add r12,#8
mov r14,#16
add r8,#4
add r9,#4
outer_loop_4:
add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
add r4,r12,r2 @pu1_src + src_strd
subs r5,r10,#0 @checks wd
ble end_inner_loop_4
inner_loop_4:
vld1.u32 {d0},[r12],r11 @vector load pu1_src
vld1.u32 {d1},[r12],r11
vld1.u32 {d2},[r12],r11
vld1.u32 {d3},[r12],r11
vld1.u32 {d4},[r12],r11
vld1.u32 {d5},[r12],r11
vld1.u32 {d6},[r12],r11
vld1.u32 {d7},[r12],r11
@add r12,r12,#4 @increment the input pointer
sub r12,r12,#4
@vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
@vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
@vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
@vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
@vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
@vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
@vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
vld1.u32 {d12},[r4],r11 @vector load pu1_src + src_strd
vld1.u32 {d13},[r4],r11
vzip.32 d0,d12 @vector zip the i iteration and ii iteration into a single register
vld1.u32 {d14},[r4],r11
vzip.32 d1,d13
vld1.u32 {d15},[r4],r11
vzip.32 d2,d14
vld1.u32 {d16},[r4],r11
vzip.32 d3,d15
vld1.u32 {d17},[r4],r11
vzip.32 d4,d16
vld1.u32 {d18},[r4],r11
vzip.32 d5,d17
vld1.u32 {d19},[r4],r11
sub r4,r4,#4
@ add r4,r4,#4 @increment the input pointer
@ vext.u8 d14,d12,d13,#2 @vector extract of src[0_2]
@ vext.u8 d15,d12,d13,#3 @vector extract of src[0_3]
@ vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
@ vext.u8 d17,d12,d13,#5 @vector extract of src[0_5]
@ vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
@ vext.u8 d19,d12,d13,#7 @vector extract of src[0_7]
@vext.u8 d13,d12,d13,#1 @vector extract of src[0_1]
vzip.32 d6,d18
vzip.32 d7,d19
vmull.u8 q4,d1,d25 @arithmetic operations for the ii iteration at the same time
vmlsl.u8 q4,d0,d24
vmlsl.u8 q4,d2,d26
vmlal.u8 q4,d3,d27
vmlal.u8 q4,d4,d28
vmlsl.u8 q4,d5,d29
vmlal.u8 q4,d6,d30
vmlsl.u8 q4,d7,d31
@ vqrshrun.s16 d8,q4,#6 @narrow right shift and saturating the result
vst1.64 {d8},[r1]! @store the i iteration result which is in the lower part of the register
vst1.64 {d9},[r6]! @store the ii iteration result which is in the upper part of the register
subs r5,r5,#4 @decrement the wd by 4
bgt inner_loop_4
end_inner_loop_4:
subs r14,r14,#2 @decrement the ht by 2
add r12,r12,r9 @increment the input pointer 2*src_strd-wd
add r1,r6,r8,lsl #1 @increment the output pointer 2*dst_strd-wd
bgt outer_loop_4
height_residue_4:
ldr r7,[sp,#ht_offset] @loads ht
and r7,r7,#1 @calculating ht_residue ht_residue = (ht & 1)
cmp r7,#0
beq end_loops
outer_loop_height_residue_4:
subs r5,r10,#0 @checks wd
ble end_inner_loop_height_residue_4
inner_loop_height_residue_4:
vld1.u32 {d0},[r12],r11 @vector load pu1_src
vld1.u32 {d1},[r12],r11
@ vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
@ vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
@ vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
@add r12,r12,#4 @increment the input pointer
@ vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
@ vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
@ vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
@ vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
vld1.u32 {d2},[r12],r11
vmull.u8 q4,d1,d25 @mul_res = vmull_u8(src[0_1], coeffabs_1)
vld1.u32 {d3},[r12],r11
vmlsl.u8 q4,d0,d24
vld1.u32 {d4},[r12],r11
vmlsl.u8 q4,d2,d26
vld1.u32 {d5},[r12],r11
vmlal.u8 q4,d3,d27
vld1.u32 {d6},[r12],r11
vmlal.u8 q4,d4,d28
vld1.u32 {d7},[r12],r11
vmlsl.u8 q4,d5,d29
sub r12,r12,#4
vmlal.u8 q4,d6,d30
vmlsl.u8 q4,d7,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)
subs r5,r5,#4 @decrement the wd by 4
vst1.64 {d8},[r1]!
bgt inner_loop_height_residue_4
end_inner_loop_height_residue_4:
subs r7,r7,#1 @decrement the ht by 1
rsb r9,r10,r2
add r12,r12,r9 @increment the input pointer src_strd-wd
add r1,r1,r8 @increment the output pointer dst_strd-wd
bgt outer_loop_height_residue_4
vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
outer_loop8_residual:
sub r12,r0,#3 @pu1_src - 3
mov r1,r7
mov r14,#32
add r1,#32
add r12,#16
mov r10,#8
add r8,#8
add r9,#8
outer_loop_8:
add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
add r4,r12,r2 @pu1_src + src_strd
subs r5,r10,#0 @checks wd
ble end_inner_loop_8
inner_loop_8:
vld1.u32 {d0},[r12],r11 @vector load pu1_src
vld1.u32 {d1},[r12],r11
vld1.u32 {d2},[r12],r11
vld1.u32 {d3},[r12],r11
@ vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
@ vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
@ vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
@ vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
@ vext.u8 d6,d0,d1,#6 @vector extract of src [0_6]
@ vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
@ vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
@ vext.u8 d14,d12,d13,#2
@vext.u8 d15,d12,d13,#3 @vector extract of src[0_3]
@ vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
@ vext.u8 d17,d12,d13,#5 @vector extract of src[0_5]
@vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
@vext.u8 d19,d12,d13,#7 @vector extract of src[0_7]
@vext.u8 d13,d12,d13,#1 @vector extract of src[0_1]
vld1.u32 {d4},[r12],r11
vmull.u8 q4,d1,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vld1.u32 {d5},[r12],r11
vmlal.u8 q4,d3,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {d6},[r12],r11
vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {d7},[r12],r11
vmlsl.u8 q4,d2,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {d12},[r4],r11 @vector load pu1_src + src_strd
vmlal.u8 q4,d4,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
vld1.u32 {d13},[r4],r11
vmlsl.u8 q4,d5,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
vld1.u32 {d14},[r4],r11
vmlal.u8 q4,d6,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
vld1.u32 {d15},[r4],r11
vmlsl.u8 q4,d7,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
vld1.u32 {d16},[r4],r11 @vector load pu1_src + src_strd
vmull.u8 q5,d15,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {d17},[r4],r11
vmlsl.u8 q5,d14,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {d18},[r4],r11
vmlal.u8 q5,d16,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
vld1.u32 {d19},[r4],r11 @vector load pu1_src + src_strd
vmlsl.u8 q5,d17,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
@ vqrshrun.s16 d20,q4,#6 @right shift and saturating narrow result 1
vmlal.u8 q5,d18,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
vmlsl.u8 q5,d19,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
vst1.16 {q4},[r1]! @store the result pu1_dst
vmlsl.u8 q5,d12,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vmlal.u8 q5,d13,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
@ vqrshrun.s16 d8,q5,#6 @right shift and saturating narrow result 2
subs r5,r5,#8 @decrement the wd loop
vst1.16 {q5},[r6]! @store the result pu1_dst
cmp r5,#4
bgt inner_loop_8
end_inner_loop_8:
subs r14,r14,#2 @decrement the ht loop
add r12,r12,r9 @increment the src pointer by 2*src_strd-wd
add r1,r6,r8,lsl #1 @increment the dst pointer by 2*dst_strd-wd
bgt outer_loop_8
ldr r10,[sp,#wd_offset] @loads wd
cmp r10,#12
beq outer_loop4_residual
ldr r7,[sp,#ht_offset] @loads ht
and r7,r7,#1
cmp r7,#1
beq height_residue_4
vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
outer_loop_16:
str r0, [sp, #-4]!
str r7, [sp, #-4]!
add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
add r4,r12,r2 @pu1_src + src_strd
and r0, r12, #31
sub r5,r10,#0 @checks wd
pld [r12, r2, lsl #1]
vld1.u32 {q0},[r12],r11 @vector load pu1_src
pld [r4, r2, lsl #1]
vld1.u32 {q1},[r12],r11
vld1.u32 {q2},[r12],r11
vld1.u32 {q3},[r12],r11
vld1.u32 {q6},[r12],r11
vmull.u8 q4,d2,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vld1.u32 {q7},[r12],r11
vmlal.u8 q4,d6,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {q8},[r12],r11
vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {q9},[r12],r11
vmlsl.u8 q4,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vmlal.u8 q4,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
vmlsl.u8 q4,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
vmlal.u8 q4,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
vmlsl.u8 q4,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
inner_loop_16:
subs r5,r5,#16
vmull.u8 q10,d3,d25
add r12,#8
vmlsl.u8 q10,d1,d24
vld1.u32 {q0},[r4],r11 @vector load pu1_src
vmlal.u8 q10,d7,d27
vld1.u32 {q1},[r4],r11
vmlsl.u8 q10,d5,d26
vld1.u32 {q2},[r4],r11
vmlal.u8 q10,d13,d28
vld1.u32 {q3},[r4],r11
vmlal.u8 q10,d17,d30
vld1.u32 {q6},[r4],r11
vmlsl.u8 q10,d15,d29
vld1.u32 {q7},[r4],r11
vmlsl.u8 q10,d19,d31
vld1.u32 {q8},[r4],r11
vmull.u8 q5,d2,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vld1.u32 {q9},[r4],r11
vmlal.u8 q5,d6,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
add r4,#8
vmlsl.u8 q5,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
pld [r12, r2, lsl #2]
pld [r4, r2, lsl #2]
vst1.8 {q4},[r1]! @store the result pu1_dst
vmlsl.u8 q5,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
addeq r12,r12,r9 @increment the src pointer by 2*src_strd-wd
vmlal.u8 q5,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
addeq r4,r12,r2 @pu1_src + src_strd
vmlsl.u8 q5,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
@ and r7, r12, #31
vmlal.u8 q5,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
subeq r14,r14,#2
vmlsl.u8 q5,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
@cmp r7, r0
vmull.u8 q11,d3,d25
@ pld [r12, r2, lsl #2]
vmlsl.u8 q11,d1,d24
vst1.16 {q10},[r1]!
vmlal.u8 q11,d7,d27
@ pld [r4, r2, lsl #2]
vmlsl.u8 q11,d5,d26
@ mov r0, r7
vmlal.u8 q11,d13,d28
cmp r14,#0
vmlal.u8 q11,d17,d30
vst1.16 {q5},[r6]!
vmlsl.u8 q11,d15,d29
vmlsl.u8 q11,d19,d31
beq epilog_16
vld1.u32 {q0},[r12],r11 @vector load pu1_src
vld1.u32 {q1},[r12],r11
vld1.u32 {q2},[r12],r11
vld1.u32 {q3},[r12],r11
vld1.u32 {q6},[r12],r11
vmull.u8 q4,d2,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vld1.u32 {q7},[r12],r11
vmlal.u8 q4,d6,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {q8},[r12],r11
vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {q9},[r12],r11
vmlsl.u8 q4,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vmlal.u8 q4,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
cmp r5,#0
vmlsl.u8 q4,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
moveq r5,r10
vmlal.u8 q4,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
vst1.8 {q11},[r6]! @store the result pu1_dst
vmlsl.u8 q4,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
addeq r1,r6,r8,lsl #1
addeq r6,r1,r3,lsl #1 @pu1_dst + dst_strd
b inner_loop_16
epilog_16:
@ vqrshrun.s16 d11,q11,#6
vst1.8 {q11},[r6]! @store the result pu1_dst
ldr r7, [sp], #4
ldr r0, [sp], #4
ldr r10,[sp,#wd_offset]
cmp r10,#24
beq outer_loop8_residual
add r1,r6,r8,lsl #1
ldr r7,[sp,#ht_offset] @loads ht
and r7,r7,#1
cmp r7,#1
beq height_residue_4
end_loops:
vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp