@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@* ihevc_inter_pred_chroma_horz_neon.s
@*
@* @brief
@* contains function definitions for inter prediction interpolation.
@* functions are coded in neon assembly and can be compiled using
@* rvct
@*
@* @author
@* yogeswaran rs / akshaya mukund
@*
@* @par list of functions:
@*
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@* chroma inter prediction filter to store the horizontal 16 bit output
@*
@* @par description:
@* applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
@* to the elements pointed to by 'pu1_src' and writes the result to the
@* location pointed to by 'pi2_dst'. no downshifting or clipping is done
@* and the output is used as an input for vertical filtering or weighted
@* prediction
@*
@* @param[in] pu1_src
@* uword8 pointer to the source
@*
@* @param[out] pi2_dst
@* word16 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] pi1_coeff
@* word8 pointer to the filter coefficients
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_chroma_horz_w16out(uword8 *pu1_src,
@ word16 *pi2_dst,
@ word32 src_strd,
@ word32 dst_strd,
@ word8 *pi1_coeff,
@ word32 ht,
@ word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pi2_dst
@r2 => src_strd
@r3 => dst_strd
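@
@ illustrative reference (a hedged sketch only, based on the description
@ above; the library's own C reference may differ in detail): every 16 bit
@ output sample is the unrounded, unclipped weighted sum of four
@ interleaved chroma samples, with a stride of 2 between taps so that cb
@ is filtered with cb and cr with cr:
@
@     for(row = 0; row < ht; row++)
@     {
@         for(col = 0; col < 2 * wd; col++)
@         {
@             word32 sum = 0;
@             for(k = 0; k < 4; k++)
@                 sum += pi1_coeff[k] * pu1_src[col + (k - 1) * 2];
@             pi2_dst[col] = (word16)sum;
@         }
@         pu1_src += src_strd;
@         pi2_dst += dst_strd;
@     }
@
@ the assembly below works with vabs(pi1_coeff) and accumulates with
@ vmlsl for taps 0 and 3 and vmull/vmlal for taps 1 and 2, relying on the
@ (-,+,+,-) sign pattern of the hevc chroma filter coefficients
@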
.text
.align 4
.globl ihevc_inter_pred_chroma_horz_w16out_a9q
.type ihevc_inter_pred_chroma_horz_w16out_a9q, %function
ihevc_inter_pred_chroma_horz_w16out_a9q:
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
ldr r4,[sp,#40] @loads pi1_coeff
ldr r6,[sp,#44] @loads ht
ldr r10,[sp,#48] @loads wd
vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
subs r14,r6,#0 @checks for ht == 0
vabs.s8 d2,d0 @vabs_s8(coeff)
@******* added
mov r11, #2
@******* added ends
ble end_loops
vdup.8 d24,d2[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
sub r12,r0,#2 @pu1_src - 2
vdup.8 d25,d2[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
add r4,r12,r2 @pu1_src_tmp2_8 = pu1_src + src_strd
vdup.8 d26,d2[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
tst r10,#3 @checks wd for multiples of 4
mov r5,r10,lsl #1 @2wd
vdup.8 d27,d2[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
and r7,r14,#1 @added @calculate ht_residue = (ht & 1)
sub r14,r14,r7 @added @decrement height by ht_residue (the residue row is handled at the end)
bne outer_loop_4 @this branch is taken when the width is 2 or 6
cmp r10,#12
beq skip_16
cmp r10,#8
bge outer_loop_16
skip_16:
tst r6,#3
@******* removal
@mov r11,#8
@******* removal ends
sub r9,r0,#2
beq outer_loop_ht_4 @this branch is taken when the height is a multiple of 4
@ cmp r10,#12
@ beq outer_loop_8
@ cmp r10,#16
@ bge outer_loop_16
b outer_loop_8
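@ dispatch summary (a sketch of the equivalent control flow, derived from
@ the branches above; the odd residue row, if any, is handled by loop_residue_4):
@
@     if (wd & 3)                   goto outer_loop_4;    /* wd == 2 or 6  */
@     else if (wd >= 8 && wd != 12) goto outer_loop_16;
@     else if ((ht & 3) == 0)       goto outer_loop_ht_4; /* wd == 4 or 12 */
@     else                          goto outer_loop_8;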
outer_loop_16:
add r4,r12,r2
and r0, r12, #31
pld [r12, r2, lsl #1]
vld1.u32 {q0},[r12],r11 @vector load pu1_src
mov r10,r5 @2wd
mul r14,r14,r10
vld1.u32 {q1},[r12],r11 @vector load pu1_src
pld [r4, r2, lsl #1]
mov r9,#10
vld1.u32 {q2},[r12],r11 @vector load pu1_src
rsb r6,r3,#8
sub r8,r3,#8
vld1.u32 {q3},[r12],r9 @vector load pu1_src
vmull.u8 q15,d2,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {q4},[r4],r11 @vector load pu1_src
vmlsl.u8 q15,d0,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {q5},[r4],r11 @vector load pu1_src
vmlal.u8 q15,d4,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {q6},[r4],r11 @vector load pu1_src
vmlsl.u8 q15,d6,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vld1.u32 {q7},[r4],r9 @vector load pu1_src
vmull.u8 q14,d3,d25
lsl r6,#1
rsb r3,r5,r3,lsl #1
vmlsl.u8 q14,d1,d24
lsl r8,#1
rsb r7,r5,r2,lsl #1
vmlal.u8 q14,d5,d26
vmlsl.u8 q14,d7,d27
cmp r14,#32
beq epilog_end
sub r14,#64
inner_loop_16:
@ and r7, r12, #31 @decrement the wd loop
@ cmp r7, r0
pld [r12, r2, lsl #2]
pld [r4, r2, lsl #2]
subs r10,r10,#16
vmull.u8 q11,d10,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
@ addeq r12,r12,r2,lsl #1
@ subeq r12,r12,r5
addeq r12,r12,r7
addeq r4,r12,r2
vst1.16 {q15}, [r1]!
vmlsl.u8 q11,d8,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {q0},[r12],r11 @vector load pu1_src
vmlal.u8 q11,d12,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {q1},[r12],r11 @vector load pu1_src
vmlsl.u8 q11,d14,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vld1.u32 {q2},[r12],r11 @vector load pu1_src
vmull.u8 q10,d11,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vst1.16 {q14}, [r1],r8
vmlsl.u8 q10,d9,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {q3},[r12],r9 @vector load pu1_src
vmlal.u8 q10,d13,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {q4},[r4],r11 @vector load pu1_src
vmlsl.u8 q10,d15,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vld1.u32 {q5},[r4],r11 @vector load pu1_src
vmull.u8 q15,d2,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {q6},[r4],r11 @vector load pu1_src
vmlsl.u8 q15,d0,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {q7},[r4],r9 @vector load pu1_src
vmlal.u8 q15,d4,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vst1.16 {q11},[r1]! @store the result pu1_dst
vmlsl.u8 q15,d6,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
moveq r10,r5 @2wd
vmull.u8 q14,d3,d25
vmlsl.u8 q14,d1,d24
vst1.16 {q10},[r1],r6 @store the result pu1_dst
addeq r1,r1,r3,lsl #1
vmlal.u8 q14,d5,d26
subs r14,r14,#32 @decrement the ht loop
vmlsl.u8 q14,d7,d27
@ mov r0, r7
bgt inner_loop_16
add r14,r14,#64
cmp r14,#32
beq epilog_end
epilog:
vst1.16 {q15}, [r1]!
vmull.u8 q11,d10,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vst1.16 {q14}, [r1],r8
vmlsl.u8 q11,d8,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
subs r10,r10,#16 @decrement the wd loop
vmlal.u8 q11,d12,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
@ addeq r12,r12,r2,lsl #1
addeq r12,r12,r7
vmlsl.u8 q11,d14,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
@ subeq r12,r12,r5
moveq r10,r5 @2wd
addeq r4,r12,r2
vmull.u8 q10,d11,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {q0},[r12],r11 @vector load pu1_src
vmlsl.u8 q10,d9,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {q1},[r12],r11 @vector load pu1_src
vmlal.u8 q10,d13,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {q2},[r12],r11 @vector load pu1_src
vmlsl.u8 q10,d15,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vld1.u32 {q3},[r12],r9 @vector load pu1_src
vmull.u8 q15,d2,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {q4},[r4],r11 @vector load pu1_src
vmlsl.u8 q15,d0,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {q5},[r4],r11 @vector load pu1_src
vmlal.u8 q15,d4,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vmlsl.u8 q15,d6,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vld1.u32 {q6},[r4],r11 @vector load pu1_src
vmull.u8 q14,d3,d25
vld1.u32 {q7},[r4],r9 @vector load pu1_src
vmlsl.u8 q14,d1,d24
vst1.16 {q11},[r1]! @store the result pu1_dst
vmlal.u8 q14,d5,d26
vst1.16 {q10},[r1],r6 @store the result pu1_dst
vmlsl.u8 q14,d7,d27
addeq r1,r1,r3,lsl #1
epilog_end:
vmull.u8 q11,d10,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vmlsl.u8 q11,d8,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vmlal.u8 q11,d12,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vmlsl.u8 q11,d14,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vmull.u8 q10,d11,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vmlsl.u8 q10,d9,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vmlal.u8 q10,d13,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vmlsl.u8 q10,d15,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vst1.16 {q15}, [r1]!
vst1.16 {q14}, [r1],r8
vst1.16 {q11},[r1]! @store the result pu1_dst
vst1.16 {q10},[r1],r6 @store the result pu1_dst
ldr r6,[sp,#44] @loads ht
and r7,r6,#1
cmp r7,#0
mov r10,r5
addne r12,r12,r2,lsl #1
subne r12,r12,r5
addne r1,r1,r3,lsl #1
bgt loop_residue_4
b end_loops
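@ outer_loop_8: taken when wd is 4 or 12 and ht is not a multiple of 4.
@ each pass of inner_loop_8 filters 8 output columns for two source rows
@ (one through r12, one through r4 = r12 + src_strd), storing one row of
@ results through r1 and the next row through r6; the outer loop then
@ advances both pointers by two rows.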
outer_loop_8:
add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
mov r10,r5 @2wd
add r4,r12,r2 @pu1_src + src_strd
inner_loop_8:
@vld1.u32 {d0,d1},[r12],r11 @vector load pu1_src
vld1.u32 {d0},[r12],r11 @vector load pu1_src
vld1.u32 {d1},[r12],r11 @vector load pu1_src
vld1.u32 {d2},[r12],r11 @vector load pu1_src
vld1.u32 {d3},[r12],r11 @vector load pu1_src
@vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
vmull.u8 q4,d1,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
@vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
@vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
vmlal.u8 q4,d2,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vmlsl.u8 q4,d3,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
@vld1.u32 {d12,d13},[r4],r11 @vector load pu1_src + src_strd
vld1.u32 {d4},[r4],r11 @vector load pu1_src
vld1.u32 {d5},[r4],r11 @vector load pu1_src
vld1.u32 {d6},[r4],r11 @vector load pu1_src
vld1.u32 {d7},[r4],r11 @vector load pu1_src
@vext.u8 d14,d12,d13,#2 @vector extract of src[0_2]
vmull.u8 q5,d5,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vmlsl.u8 q5,d4,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
@vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
@vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
vmlal.u8 q5,d6,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vmlsl.u8 q5,d7,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vst1.16 {d8, d9}, [r1]!
subs r10,r10,#8 @decrement the wd loop
vst1.16 {d10, d11},[r6]! @store the result pu1_dst
bgt inner_loop_8
sub r12,r12,r5
subs r14,r14,#2 @decrement the ht loop
sub r1,r1,r5,lsl #1
add r12,r12,r2,lsl #1
add r1,r1,r3,lsl #2
bgt outer_loop_8
cmp r7,#0
mov r10,r5
bgt loop_residue_4
b end_loops
@the following path is taken when the height is a multiple of 4
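@ this path filters four source rows at a time, eight output columns per
@ iteration, and is software pipelined: prologue_ht_4 issues the loads and
@ multiplies for the first column group, core_loop overlaps loads,
@ filtering and stores for the following groups, and epilogue drains the
@ last results. r9 tracks the source column base and r8 = 2*dst_strd
@ (bytes) is the store stride between destination rows.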
outer_loop_ht_4:
mov r10,r5
prologue_ht_4:
mov r8,r3,lsl #1
inner_loop_ht_4:
mov r12,r9
mov r4,r1
sub r0, r2, #6 @ not sure if r0 needs to be preserved
vld1.u32 {d0},[r12],r11 @(1)vector load pu1_src
vld1.u32 {d1},[r12],r11 @(1)vector load pu1_src
vld1.u32 {d2},[r12],r11 @(1)vector load pu1_src
vld1.u32 {d3},[r12],r0 @(1)vector load pu1_src
vld1.u32 {d4},[r12],r11 @(2)vector load pu1_src
vld1.u32 {d5},[r12],r11 @(2)vector load pu1_src
vld1.u32 {d6},[r12],r11 @(2)vector load pu1_src
vld1.u32 {d7},[r12],r0 @(2)vector load pu1_src
vld1.u32 {d14},[r12],r11 @(3)vector load pu1_src
vmull.u8 q4,d1,d25 @(1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {d15},[r12],r11 @(3)vector load pu1_src
vmlsl.u8 q4,d0,d24 @(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {d16},[r12],r11 @(3)vector load pu1_src
vmlal.u8 q4,d2,d26 @(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {d17},[r12],r0 @(3)vector load pu1_src
vmlsl.u8 q4,d3,d27 @(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vld1.u32 {d18},[r12],r11 @(4)vector load pu1_src
vmull.u8 q5,d5,d25 @(2)mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {d19},[r12],r11 @(4)vector load pu1_src
vmlsl.u8 q5,d4,d24 @(2)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {d20},[r12],r11 @(4)vector load pu1_src
vmlal.u8 q5,d6,d26 @(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {d21},[r12],r2 @(4)vector load pu1_src
vmlsl.u8 q5,d7,d27 @(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
add r9,r9,#8 @(core loop)
subs r10,r10,#8 @(prologue)decrement the wd loop
beq epilogue
core_loop:
vst1.16 {d8, d9},[r4],r8 @(1)store the result pu1_dst
mov r12,r9
vld1.u32 {d0},[r12],r11 @(1_1)vector load pu1_src
vmull.u8 q6,d15,d25 @(3)mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {d1},[r12],r11 @(1_1)vector load pu1_src
vmlsl.u8 q6,d14,d24 @(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {d2},[r12],r11 @(1_1)vector load pu1_src
vmlal.u8 q6,d16,d26 @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {d3},[r12],r0 @(1_1)vector load pu1_src
vmlsl.u8 q6,d17,d27 @(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vst1.16 {d10, d11},[r4],r8 @(2)store the result pu1_dst
add r9,r9,#8 @(core loop)
vld1.u32 {d4},[r12],r11 @(2_1)vector load pu1_src
vmull.u8 q11,d19,d25 @(4)mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {d5},[r12],r11 @(2_1)vector load pu1_src
vmlsl.u8 q11,d18,d24 @(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {d6},[r12],r11 @(2_1)vector load pu1_src
vmlal.u8 q11,d20,d26 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {d7},[r12],r0 @(2_1)vector load pu1_src
vmlsl.u8 q11,d21,d27 @(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vst1.16 {d12, d13},[r4],r8 @(3)store the result pu1_dst
add r1,r1,#16 @(core loop)
vld1.u32 {d14},[r12],r11 @(3_1)vector load pu1_src
vmull.u8 q4,d1,d25 @(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {d15},[r12],r11 @(3_1)vector load pu1_src
vmlsl.u8 q4,d0,d24 @(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {d16},[r12],r11 @(3_1)vector load pu1_src
vmlal.u8 q4,d2,d26 @(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {d17},[r12],r0 @(3_1)vector load pu1_src
vmlsl.u8 q4,d3,d27 @(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vst1.16 {d22, d23}, [r4], r8 @(4)store the result pu1_dst
subs r10,r10,#8 @(core loop)
vmull.u8 q5,d5,d25 @(2_1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {d18},[r12],r11 @(4_1)vector load pu1_src
vld1.u32 {d19},[r12],r11 @(4_1)vector load pu1_src
vmlsl.u8 q5,d4,d24 @(2_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {d20},[r12],r11 @(4_1)vector load pu1_src
vmlal.u8 q5,d6,d26 @(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
mov r4, r1 @(core loop)
vld1.u32 {d21},[r12],r0 @(4_1)vector load pu1_src
vmlsl.u8 q5,d7,d27 @(2_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
bgt core_loop @loopback
epilogue:
vmull.u8 q6,d15,d25 @(3)mul_res = vmull_u8(src[0_3], coeffabs_3)@
vmlsl.u8 q6,d14,d24 @(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vmlal.u8 q6,d16,d26 @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vmlsl.u8 q6,d17,d27 @(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vst1.16 {d8, d9},[r4], r8 @(1)store the result pu1_dst
vmull.u8 q11,d19,d25 @(4)mul_res = vmull_u8(src[0_3], coeffabs_3)@
vmlsl.u8 q11,d18,d24 @(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vmlal.u8 q11,d20,d26 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vmlsl.u8 q11,d21,d27 @(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vst1.16 {d10, d11},[r4], r8 @(2)store the result pu1_dst
vst1.16 {d12, d13},[r4], r8 @(3)store the result pu1_dst
add r1,r1,#16 @(core loop)
vst1.16 {d22, d23},[r4], r8 @(4)store the result pu1_dst
sub r9,r9,r5
subs r14,r14,#4 @decrement the ht loop
sub r1,r1,r5,lsl #1
add r9,r9,r2,lsl #2
add r1,r1,r3,lsl #3
bgt outer_loop_ht_4
cmp r7,#0
mov r10,r5
movgt r12,r9
movgt r4,r1
bgt loop_residue_4
b end_loops
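@ outer_loop_4: taken when wd is 2 or 6. four columns of row i and four
@ columns of row ii are interleaved with vzip.32 so that a single
@ vmull/vmlsl/vmlal sequence filters both rows at once; d8 then holds the
@ row i results and d9 the row ii results.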
outer_loop_4:
add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
mov r10,r5
add r4,r12,r2 @pu1_src + src_strd
inner_loop_4:
@vld1.u32 {d0,d1},[r12] @vector load pu1_src
vld1.u32 {d0},[r12],r11 @vector load pu1_src
vld1.u32 {d1},[r12],r11 @vector load pu1_src
vld1.u32 {d2},[r12],r11 @vector load pu1_src
vld1.u32 {d3},[r12] @vector load pu1_src
@**** removal
@add r12,r12,#4 @increment the input pointer
@**** removal ends
@**** addn
sub r12,r12,#2 @adjust the input pointer (net advance of 4 bytes per iteration)
@**** addn ends
vld1.u32 {d4},[r4],r11 @vector load pu1_src
vld1.u32 {d5},[r4],r11 @vector load pu1_src
vld1.u32 {d6},[r4],r11 @vector load pu1_src
vld1.u32 {d7},[r4] @vector load pu1_src
@vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
@vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
@vld1.u32 {d12,d13},[r4] @vector load pu1_src + src_strd
@vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
@add r4,r4,#4 @increment the input pointer
sub r4,r4,#2 @adjust the second row input pointer in the same way
@vext.u8 d14,d12,d13,#2 @vector extract of src[0_2]
@vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
@vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
@**** removal
@vzip.32 d0,d12 @vector zip the i iteration and ii iteration in a single register
@vzip.32 d2,d14
@vzip.32 d4,d16
@vzip.32 d6,d18
@**** removal ends
@**** addn
vzip.32 d0,d4 @vector zip the i iteration and ii iteration into a single register
vzip.32 d1,d5
vzip.32 d2,d6
vzip.32 d3,d7
@**** addn ends
vmull.u8 q4,d1,d25 @arithmetic operations for both the i and ii iterations at the same time
vmlsl.u8 q4,d0,d24
vmlal.u8 q4,d2,d26
vmlsl.u8 q4,d3,d27
vst1.32 {d8},[r1]! @store the i iteration result which is in the lower part of the register
subs r10,r10,#4 @decrement the wd by 4
vst1.32 {d9},[r6]! @store the ii iteration result which is in the upper part of the register
bgt inner_loop_4
sub r12,r12,r5
subs r14,r14,#2 @decrement the ht by 2
sub r1,r1,r5,lsl #1
add r12,r12,r2,lsl #1
add r1,r1,r3,lsl #2
bgt outer_loop_4
cmp r7,#0
mov r10,r5
beq end_loops
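@ loop_residue_4: filters the final row when ht is odd (ht_residue in r7),
@ four output columns per pass, for a single source row.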
loop_residue_4:
mov r10,r5 @2wd
loop_residue:
@vld1.u32 {d0,d1},[r12] @vector load pu1_src
vld1.u32 {d0},[r12],r11 @vector load pu1_src
vld1.u32 {d1},[r12],r11 @vector load pu1_src
vld1.u32 {d2},[r12],r11 @vector load pu1_src
vld1.u32 {d3},[r12] @vector load pu1_src
@vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
@vmull.u8 q4,d2,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
@vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
@vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
@add r12,r12,#4 @pu1_src + 4
sub r12, r12, #2 @adjust the input pointer (net advance of 4 bytes per iteration)
@vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
@vmlal.u8 q4,d4,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
@vmlsl.u8 q4,d6,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vmull.u8 q4,d1,d25
vmlsl.u8 q4,d0,d24
vmlal.u8 q4,d2,d26
vmlsl.u8 q4,d3,d27
vst1.64 {d8},[r1] @store the result pu1_dst
subs r10,r10,#4 @decrement the wd loop
add r1,r1,#8 @pi2_dst + 8
bgt loop_residue @loop again
@inner loop ends
@add r8,r3,lsl #1 @2*dst_strd
@sub r8,r8,r5,lsl #1 @2*dst_strd - 2wd
@sub r9,r2,r5 @src_strd - 2wd
@subs r7,r7,#1 @decrement the ht loop
@add r12,r12,r9 @pu1_src + src_strd
@add r1,r1,r8 @pu1_dst + 2*dst_strd
@bgt outer_loop_residue_4 @loop again
@b end_loops @jumps to end
end_loops:
ldmfd sp!,{r4-r12,r15} @reload the registers from sp