@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@* ihevc_inter_pred_luma_horz.s
@*
@* @brief
@* contains function definitions for inter prediction interpolation.
@* functions are coded in neon assembly and can be assembled using
@* rvct
@*
@* @author
@* parthiban v
@*
@* @par list of functions:
@*
@* - ihevc_inter_pred_luma_horz()
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
@/* include reconstruction */
@
@/**
@*******************************************************************************
@*
@* @brief
@* interprediction luma filter for horizontal input
@*
@* @par description:
@* applies a horizontal filter with coefficients pointed to by 'pi1_coeff' to
@* the elements pointed to by 'pu1_src' and writes to the location pointed to
@* by 'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits
@* assumptions : the function is optimized assuming that the width is a
@* multiple of 4 or 8 and the height is a multiple of 2.
@*
@* @param[in] pu1_src
@* uword8 pointer to the source
@*
@* @param[out] pu1_dst
@* uword8 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] pi1_coeff
@* word8 pointer to the filter coefficients
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_luma_horz (
@ uword8 *pu1_src,
@ uword8 *pu1_dst,
@ word32 src_strd,
@ word32 dst_strd,
@ word8 *pi1_coeff,
@ word32 ht,
@ word32 wd )
@**************variables vs registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => *pi1_coeff
@ r5 => ht
@ r6 => wd
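@
@ a minimal c sketch of the per-pixel computation implemented by the assembly
@ below, intended to mirror the reference in ihevc_inter_pred_filters.c (for
@ illustration only, not part of the build; clip_u8 is a hypothetical helper
@ that clamps its argument to the range [0, 255]):
@
@ for(row = 0; row < ht; row++)
@ {
@     for(col = 0; col < wd; col++)
@     {
@         word32 sum = 0;
@         for(tap = 0; tap < 8; tap++)
@             sum += pi1_coeff[tap] * pu1_src[col + tap - 3];
@         pu1_dst[col] = clip_u8((sum + 32) >> 6);
@     }
@     pu1_src += src_strd;
@     pu1_dst += dst_strd;
@ }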
.text
.align 4
.globl ihevc_inter_pred_luma_horz_a9q
.type ihevc_inter_pred_luma_horz_a9q, %function
ihevc_inter_pred_luma_horz_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
@str r1,[sp,#-4]
@ mov r7,#8192
start_loop_count:
@ ldr r1,[sp,#-4]
ldr r4,[sp,#40] @loads pi1_coeff
ldr r8,[sp,#44] @loads ht
ldr r10,[sp,#48] @loads wd
vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
mov r11,#1
subs r14,r8,#0 @checks for ht == 0
vabs.s8 d2,d0 @vabs_s8(coeff)
@ble end_loops
vdup.8 d24,d2[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
sub r12,r0,#3 @pu1_src - 3
vdup.8 d25,d2[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
add r4,r12,r2 @pu1_src_tmp2_8 = pu1_src + src_strd
vdup.8 d26,d2[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
rsb r9,r10,r2,lsl #1 @2*src_strd - wd
vdup.8 d27,d2[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
rsb r8,r10,r3,lsl #1 @2*dst_strd - wd
vdup.8 d28,d2[4] @coeffabs_4 = vdup_lane_u8(coeffabs, 4)
vdup.8 d29,d2[5] @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
@ tst r10,#7 @checks wd for multiples
vdup.8 d30,d2[6] @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
vdup.8 d31,d2[7] @coeffabs_7 = vdup_lane_u8(coeffabs, 7)
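@ the coefficients are loaded as absolute values (vabs above); the sign of each
@ tap is applied by the choice of vmlal (accumulate) vs vmlsl (subtract) in the
@ loops below: taps 1, 3, 4 and 6 are accumulated while taps 0, 2, 5 and 7 are
@ subtracted, matching the sign pattern of the hevc luma interpolation filters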
mov r7,r1
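@ dispatch on wd: wd == 4 uses the 4-pixel path, wd == 8 (and the first 8
@ columns of wd == 12) the 8-pixel path, wd >= 16 the 16-pixel path; wd == 24
@ is split into a 16-wide pass plus an 8-wide residual pass, and wd == 12 into
@ an 8-wide pass plus a 4-wide residual pass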
cmp r10,#4
ble outer_loop_4
cmp r10,#24
moveq r10,#16
addeq r8,#8
addeq r9,#8
cmp r10,#16
bge outer_loop_16
cmp r10,#12
addeq r8,#4
addeq r9,#4
b outer_loop_8
outer_loop8_residual:
sub r12,r0,#3 @pu1_src - 3
mov r1,r7
mov r14,#32
add r1,#16
add r12,#16
mov r10,#8
add r8,#8
add r9,#8
outer_loop_8:
add r6,r1,r3 @pu1_dst + dst_strd
add r4,r12,r2 @pu1_src + src_strd
subs r5,r10,#0 @checks wd
ble end_inner_loop_8
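@ each pass of inner_loop_8 produces 8 horizontally filtered pixels for two
@ rows at once: d0-d7 hold the eight 1-byte-shifted source windows of the
@ current row (r12 is post-incremented by r11 = 1 per load) and d12-d19 the
@ same windows for the next row; vqrshrun.s16 #6 adds the rounding offset of
@ 32, shifts right by 6 and saturates the result to 8 bits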
inner_loop_8:
vld1.u32 {d0},[r12],r11 @vector load pu1_src
vld1.u32 {d1},[r12],r11
vld1.u32 {d2},[r12],r11
vld1.u32 {d3},[r12],r11
@ vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
@ vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
@ vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
@ vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
@ vext.u8 d6,d0,d1,#6 @vector extract of src [0_6]
@ vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
@ vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
@ vext.u8 d14,d12,d13,#2
@vext.u8 d15,d12,d13,#3 @vector extract of src[0_3]
@ vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
@ vext.u8 d17,d12,d13,#5 @vector extract of src[0_5]
@vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
@vext.u8 d19,d12,d13,#7 @vector extract of src[0_7]
@vext.u8 d13,d12,d13,#1 @vector extract of src[0_1]
vld1.u32 {d4},[r12],r11
vmull.u8 q4,d1,d25 @mul_res = vmull_u8(src[0_1], coeffabs_1)@
vld1.u32 {d5},[r12],r11
vmlal.u8 q4,d3,d27 @mul_res = vmlal_u8(src[0_3], coeffabs_3)@
vld1.u32 {d6},[r12],r11
vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {d7},[r12],r11
vmlsl.u8 q4,d2,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {d12},[r4],r11 @vector load pu1_src + src_strd
vmlal.u8 q4,d4,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
vld1.u32 {d13},[r4],r11
vmlsl.u8 q4,d5,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
vld1.u32 {d14},[r4],r11
vmlal.u8 q4,d6,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
vld1.u32 {d15},[r4],r11
vmlsl.u8 q4,d7,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
vld1.u32 {d16},[r4],r11 @vector load pu1_src + src_strd
vmull.u8 q5,d15,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {d17},[r4],r11
vmlsl.u8 q5,d14,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {d18},[r4],r11
vmlal.u8 q5,d16,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
vld1.u32 {d19},[r4],r11 @vector load pu1_src + src_strd
vmlsl.u8 q5,d17,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
vqrshrun.s16 d20,q4,#6 @right shift and saturating narrow result 1
vmlal.u8 q5,d18,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
vmlsl.u8 q5,d19,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
vst1.8 {d20},[r1]! @store the result pu1_dst
vmlsl.u8 q5,d12,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vmlal.u8 q5,d13,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vqrshrun.s16 d8,q5,#6 @right shift and saturating narrow result 2
subs r5,r5,#8 @decrement the wd loop
vst1.8 {d8},[r6]! @store the result pu1_dst
cmp r5,#4
bgt inner_loop_8
end_inner_loop_8:
subs r14,r14,#2 @decrement the ht loop
add r12,r12,r9 @increment the src pointer by 2*src_strd-wd
add r1,r1,r8 @increment the dst pointer by 2*dst_strd-wd
bgt outer_loop_8
ldr r10,[sp,#48] @loads wd
cmp r10,#12
beq outer_loop4_residual
end_loops:
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
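@ the 16-pixel path works on q registers (16 bytes per load) and is software
@ pipelined: the multiply-accumulate chain for the current 16 outputs is
@ interleaved with the loads for the next row / next iteration and the stores
@ of the previous results, so memory and neon operations overlap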
outer_loop_16:
str r0, [sp, #-4]!
str r7, [sp, #-4]!
add r6,r1,r3 @pu1_dst + dst_strd
add r4,r12,r2 @pu1_src + src_strd
and r0, r12, #31
sub r5,r10,#0 @checks wd
@ble end_loops1
pld [r12, r2, lsl #1]
vld1.u32 {q0},[r12],r11 @vector load pu1_src
pld [r4, r2, lsl #1]
vld1.u32 {q1},[r12],r11
vld1.u32 {q2},[r12],r11
vld1.u32 {q3},[r12],r11
vld1.u32 {q6},[r12],r11
vmull.u8 q4,d2,d25 @mul_res = vmull_u8(src[0_1], coeffabs_1)@
vld1.u32 {q7},[r12],r11
vmlal.u8 q4,d6,d27 @mul_res = vmlal_u8(src[0_3], coeffabs_3)@
vld1.u32 {q8},[r12],r11
vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {q9},[r12],r11
vmlsl.u8 q4,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vmlal.u8 q4,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
vmlsl.u8 q4,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
vmlal.u8 q4,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
vmlsl.u8 q4,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
inner_loop_16:
subs r5,r5,#16
vmull.u8 q10,d3,d25
add r12,#8
vmlsl.u8 q10,d1,d24
subeq r14,r14,#2
vmlal.u8 q10,d7,d27
vld1.u32 {q0},[r4],r11 @vector load pu1_src
vmlsl.u8 q10,d5,d26
vld1.u32 {q1},[r4],r11
vmlal.u8 q10,d13,d28
vld1.u32 {q2},[r4],r11
vmlal.u8 q10,d17,d30
vld1.u32 {q3},[r4],r11
vmlsl.u8 q10,d15,d29
vld1.u32 {q6},[r4],r11
vmlsl.u8 q10,d19,d31
vld1.u32 {q7},[r4],r11
vqrshrun.s16 d8,q4,#6 @right shift and saturating narrow result 1
vld1.u32 {q8},[r4],r11
vmull.u8 q5,d2,d25 @mul_res = vmull_u8(src[0_1], coeffabs_1)@
vld1.u32 {q9},[r4],r11
vmlal.u8 q5,d6,d27 @mul_res = vmlal_u8(src[0_3], coeffabs_3)@
pld [r12, r2, lsl #2]
pld [r4, r2, lsl #2]
add r4,#8
vmlsl.u8 q5,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
addeq r12,r12,r9 @increment the src pointer by 2*src_strd-wd
vmlsl.u8 q5,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
addeq r4,r12,r2 @pu1_src + src_strd
vqrshrun.s16 d9,q10,#6
vmlal.u8 q5,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
@ and r7, r12, #31
vmlsl.u8 q5,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
vmlal.u8 q5,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
vmlsl.u8 q5,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
vmull.u8 q11,d3,d25
vmlsl.u8 q11,d1,d24
vst1.8 {q4},[r1]! @store the result pu1_dst
vmlal.u8 q11,d7,d27
addeq r1,r1,r8
vqrshrun.s16 d10,q5,#6 @right shift and saturating narrow result 2
@ cmp r7, r0
vmlsl.u8 q11,d5,d26
vmlal.u8 q11,d13,d28
vmlal.u8 q11,d17,d30
@ mov r0, r7
vmlsl.u8 q11,d15,d29
cmp r14,#0
vmlsl.u8 q11,d19,d31
beq epilog_16
vld1.u32 {q0},[r12],r11 @vector load pu1_src
vld1.u32 {q1},[r12],r11
vld1.u32 {q2},[r12],r11
vld1.u32 {q3},[r12],r11
vld1.u32 {q6},[r12],r11
vqrshrun.s16 d11,q11,#6
vmull.u8 q4,d2,d25 @mul_res = vmull_u8(src[0_1], coeffabs_1)@
vld1.u32 {q7},[r12],r11
vmlal.u8 q4,d6,d27 @mul_res = vmlal_u8(src[0_3], coeffabs_3)@
vld1.u32 {q8},[r12],r11
vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {q9},[r12],r11
vmlsl.u8 q4,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vmlal.u8 q4,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
cmp r5,#0
vmlsl.u8 q4,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
moveq r5,r10
vmlal.u8 q4,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
vst1.8 {q5},[r6]! @store the result pu1_dst
vmlsl.u8 q4,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
addeq r6,r1,r3 @pu1_dst + dst_strd
b inner_loop_16
epilog_16:
vqrshrun.s16 d11,q11,#6
vst1.8 {q5},[r6]! @store the result pu1_dst
ldr r7, [sp], #4
ldr r0, [sp], #4
ldr r10,[sp,#48]
cmp r10,#24
beq outer_loop8_residual
end_loops1:
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
outer_loop4_residual:
sub r12,r0,#3 @pu1_src - 3
mov r1,r7
add r1,#8
mov r10,#4
add r12,#8
mov r14,#16
add r8,#4
add r9,#4
outer_loop_4:
add r6,r1,r3 @pu1_dst + dst_strd
add r4,r12,r2 @pu1_src + src_strd
subs r5,r10,#0 @checks wd
ble end_inner_loop_4
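@ the 4-pixel path loads 4-byte source windows for two rows and vzips each
@ pair into the low and high halves of one d register, so a single multiply
@ chain filters both rows; the two 32-bit results are then stored to the two
@ destination rows separately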
inner_loop_4:
vld1.u32 {d0},[r12],r11 @vector load pu1_src
vld1.u32 {d1},[r12],r11
vld1.u32 {d2},[r12],r11
vld1.u32 {d3},[r12],r11
vld1.u32 {d4},[r12],r11
vld1.u32 {d5},[r12],r11
vld1.u32 {d6},[r12],r11
vld1.u32 {d7},[r12],r11
@add r12,r12,#4 @increment the input pointer
sub r12,r12,#4
@vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
@vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
@vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
@vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
@vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
@vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
@vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
vld1.u32 {d12},[r4],r11 @vector load pu1_src + src_strd
vld1.u32 {d13},[r4],r11
vzip.32 d0,d12 @vector zip the i iteration and ii iteration into a single register
vld1.u32 {d14},[r4],r11
vzip.32 d1,d13
vld1.u32 {d15},[r4],r11
vzip.32 d2,d14
vld1.u32 {d16},[r4],r11
vzip.32 d3,d15
vld1.u32 {d17},[r4],r11
vzip.32 d4,d16
vld1.u32 {d18},[r4],r11
vzip.32 d5,d17
vld1.u32 {d19},[r4],r11
sub r4,r4,#4
@ add r4,r4,#4 @increment the input pointer
@ vext.u8 d14,d12,d13,#2 @vector extract of src[0_2]
@ vext.u8 d15,d12,d13,#3 @vector extract of src[0_3]
@ vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
@ vext.u8 d17,d12,d13,#5 @vector extract of src[0_5]
@ vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
@ vext.u8 d19,d12,d13,#7 @vector extract of src[0_7]
@vext.u8 d13,d12,d13,#1 @vector extract of src[0_1]
vzip.32 d6,d18
vzip.32 d7,d19
vmull.u8 q4,d1,d25 @arithmetic operations for the i and ii iterations at the same time
vmlsl.u8 q4,d0,d24
vmlsl.u8 q4,d2,d26
vmlal.u8 q4,d3,d27
vmlal.u8 q4,d4,d28
vmlsl.u8 q4,d5,d29
vmlal.u8 q4,d6,d30
vmlsl.u8 q4,d7,d31
vqrshrun.s16 d8,q4,#6 @narrow right shift and saturating the result
vst1.32 {d8[0]},[r1]! @store the i iteration result which is in the lower part of the register
vst1.32 {d8[1]},[r6]! @store the ii iteration result which is in the upper part of the register
subs r5,r5,#4 @decrement the wd by 4
bgt inner_loop_4
end_inner_loop_4:
subs r14,r14,#2 @decrement the ht by 2
add r12,r12,r9 @increment the input pointer 2*src_strd-wd
add r1,r1,r8 @increment the output pointer 2*dst_strd-wd
bgt outer_loop_4
@subs r7,r7,#1
@ bgt start_loop_count
ldmfd sp!,{r4-r12,r15} @reload the registers from sp