| @/***************************************************************************** |
| @* |
| @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
| @* |
| @* Licensed under the Apache License, Version 2.0 (the "License"); |
| @* you may not use this file except in compliance with the License. |
| @* You may obtain a copy of the License at: |
| @* |
| @* http://www.apache.org/licenses/LICENSE-2.0 |
| @* |
| @* Unless required by applicable law or agreed to in writing, software |
| @* distributed under the License is distributed on an "AS IS" BASIS, |
| @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| @* See the License for the specific language governing permissions and |
| @* limitations under the License. |
| @* |
| @*****************************************************************************/ |
| @/** |
| @******************************************************************************* |
| @* @file |
| @* ihevc_inter_pred_chroma_vert_neon_w16inp_w16out_neon.s |
| @* |
| @* @brief |
| @* contains function definitions for inter prediction interpolation. |
| @* functions are coded in neon assembly and can be compiled using |
| @* rvct |
| @* |
| @* @author |
| @* yogeswaran rs / parthiban |
| @* |
| @* @par list of functions: |
| @* |
| @* |
| @* @remarks |
| @* none |
| @* |
| @******************************************************************************* |
| @*/ |
| @/** |
| @******************************************************************************* |
| @* |
| @* @brief |
| @* chroma interprediction filter for 16bit vertical input and output. |
| @* |
| @* @par description: |
| @* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to |
| @* the elements pointed to by 'pi2_src' and writes the result to the location |
| @* pointed to by 'pi2_dst'. the input is 16 bits. the filter output is |
| @* downshifted by 6 and stored as a 16 bit number. (this comment originally |
| @* stated that 8192 is subtracted before storing; no such subtraction is |
| @* visible in the code of this file - confirm against the c reference.) |
| @* the output is used as an input to weighted prediction. |
| @* assumptions : the function is optimized considering the fact that width |
| @* and height are multiples of 2. |
| @* |
| @* |
| @* @param[in] pi2_src |
| @* word16 pointer to the source |
| @* |
| @* @param[out] pi2_dst |
| @* word16 pointer to the destination |
| @* |
| @* @param[in] src_strd |
| @* integer source stride |
| @* |
| @* @param[in] dst_strd |
| @* integer destination stride |
| @* |
| @* @param[in] pi1_coeff |
| @* word8 pointer to the filter coefficients |
| @* |
| @* @param[in] ht |
| @* integer height of the array |
| @* |
| @* @param[in] wd |
| @* integer width of the array |
| @* |
| @* @returns |
| @* |
| @* @remarks |
| @* none |
| @* |
| @******************************************************************************* |
| @*/ |
| @void ihevc_inter_pred_chroma_vert_w16inp_w16out(word16 *pi2_src, |
| @ word16 *pi2_dst, |
| @ word32 src_strd, |
| @ word32 dst_strd, |
| @ word8 *pi1_coeff, |
| @ word32 ht, |
| @ word32 wd) |
| @**************variables vs registers***************************************** |
| @r0 => *pi2_src |
| @r1 => *pi2_dst |
| @r2 => src_strd |
| @r3 => dst_strd |
| .text |
| .align 4 |
| |
| |
| |
| |
| .globl ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q |
| |
| .type ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q, %function |
| |
| @ byte offsets of the stack-passed arguments from sp, valid once the prologue |
| @ has pushed 10 core registers (40 bytes) and d8-d15 (64 bytes) |
| .equ coeff_offset, 104 |
| .equ ht_offset, 108 |
| .equ wd_offset, 112 |
| |
| @ in : r0 = pi2_src, r1 = pi2_dst, r2 = src_strd, r3 = dst_strd |
| @ stack: pi1_coeff, ht, wd |
| @ out : 4-tap vertically filtered rows written through r1, each 32-bit sum |
| @ narrowed to 16 bit with a saturating right shift by 6 |
| @ uses : r4-r12, lr, q0-q4, d12-d15, q12-q15 (core regs and d8-d15 are |
| @ saved and restored here) |
| @ paths: a 2-rows-per-pass loop, and a software pipelined 4-rows-per-pass loop |
| @ (prolog / kernel_4 / epilog) taken when wd and ht are multiples of 4 |
| ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q: |
| |
| stmfd sp!, {r4-r12, r14} @save callee-saved core registers and lr |
| vpush {d8-d15} @q4 (d8,d9) and d12-d15 are written below; d8-d15 are callee-saved under the aapcs |
| |
| ldr r4, [sp,#coeff_offset] @r4 = pi1_coeff |
| ldr r6, [sp,#wd_offset] @r6 = wd |
| lsl r2,r2,#1 @src_strd in bytes (2 bytes per sample) |
| ldr r5,[sp,#ht_offset] @r5 = ht |
| vld1.8 {d0},[r4] @loads 8 bytes from pi1_coeff, only the first 4 are used |
| sub r4,r0,r2 @r4 = pi2_src - src_strd (row above the current one) |
| vmovl.s8 q0,d0 @sign extend the coefficients to 16 bit |
| |
| tst r6,#3 @z is set when wd is a multiple of 4 |
| vdup.16 d12,d0[0] @coeff_0 |
| vdup.16 d13,d0[1] @coeff_1 |
| vdup.16 d14,d0[2] @coeff_2 |
| vdup.16 d15,d0[3] @coeff_3 |
| |
| bne core_loop_ht_2 @wd not a multiple of 4 -> 2 row loop (tst does not write v, so gt would read a stale flag) |
| |
| tst r5,#3 @checks ht == mul of 4 |
| beq core_loop_ht_4 @jumps to loop handles ht mul of 4 |
| |
| @ 2 rows per pass, 4 samples (8 bytes) per inner iteration |
| core_loop_ht_2: |
| lsl r7,r2,#1 @two source rows in bytes |
| lsl r3,r3,#1 @dst_strd in bytes |
| lsl r9,r6,#2 @4*wd = bytes processed per row |
| sub r6,r3,r6,lsl #1 @dst_strd in bytes - 2*wd (doubled when applied below) |
| sub r8,r7,r9 @two source rows in bytes - 4*wd |
| mov r12,r9 @r12 = bytes left in the current row |
| |
| inner_loop_ht_2: |
| add r0,r4,r2 @r0 = next source row |
| vld1.16 {d0},[r4]! @source row 0, r4 advances by 8 bytes |
| vmull.s16 q0,d0,d12 @out row 0 = row0 * coeff_0 |
| subs r12,r12,#8 @8 bytes of the row consumed, flags drive the bgt below |
| vld1.16 {d2},[r0],r2 @source row 1 |
| vmull.s16 q4,d2,d12 @out row 1 = row1 * coeff_0 |
| vld1.16 {d3},[r0],r2 @source row 2 |
| vmlal.s16 q0,d2,d13 @out row 0 += row1 * coeff_1 |
| vld1.16 {d6},[r0],r2 @source row 3 |
| vmlal.s16 q4,d3,d13 @out row 1 += row2 * coeff_1 |
| vld1.16 {d2},[r0] @source row 4 (d2 is reused, row 1 is no longer needed) |
| add r7,r1,r3 @r7 = pi2_dst + dst_strd (second output row) |
| vmlal.s16 q0,d3,d14 @out row 0 += row2 * coeff_2 |
| vmlal.s16 q4,d6,d14 @out row 1 += row3 * coeff_2 |
| vmlal.s16 q0,d6,d15 @out row 0 += row3 * coeff_3 |
| vmlal.s16 q4,d2,d15 @out row 1 += row4 * coeff_3 |
| vqshrn.s32 d0,q0,#6 @saturating narrow with right shift by 6 |
| vqshrn.s32 d30,q4,#6 @saturating narrow with right shift by 6 |
| vst1.32 {d0},[r1]! @stores out row 0, r1 advances by 8 bytes |
| vst1.32 {d30},[r7] @stores out row 1 |
| bgt inner_loop_ht_2 @more samples in this row pair (flags from subs r12) |
| |
| @inner loop ends |
| subs r5,r5,#2 @two rows done |
| add r1,r1,r6,lsl #1 @pi2_dst to the start of the next row pair |
| mov r12,r9 @reload the per row byte count |
| add r4,r4,r8 @source to the start of the next row pair |
| bgt inner_loop_ht_2 @loop again while rows remain (flags from subs r5) |
| |
| b end_loops @jumps to end |
| |
| @ 4 rows per pass, software pipelined: prolog primes the pipeline, kernel_4 is |
| @ the steady state and epilog drains it. do not reorder: the conditional |
| @ instructions (addle/movle) consume the flags of the preceding subs r11. |
| core_loop_ht_4: |
| lsl r7,r2,#2 @four source rows in bytes |
| lsl r10,r3,#2 @4*dst_strd (r3 is still in samples here) |
| mov r11,r6,lsr #1 @wd / 2 |
| sub lr,r10,r6,lsl #1 @4*dst_strd - 2*wd (doubled when applied below) |
| sub r8,r7,r6,lsl #2 @four source rows in bytes - 4*wd |
| |
| mul r12,r5,r11 @ht * wd / 2, decremented by 4 per 4x4 block |
| sub r12,#4 @one block less, it is produced by the epilog |
| mov r11,r6,lsl #1 @2*wd = samples left in the current row |
| lsl r3,r3,#1 @dst_strd in bytes |
| |
| prolog: |
| add r0,r4,r2 @r0 = next source row |
| vld1.16 {d0},[r4]! @source row 0, r4 advances by 8 bytes |
| vld1.16 {d1},[r0],r2 @source row 1 |
| subs r11,r11,#4 @4 samples of the row consumed, flags used by addle/movle below |
| vld1.16 {d2},[r0],r2 @source row 2 |
| vmull.s16 q15,d0,d12 @out row 0 = row0 * coeff_0 |
| vld1.16 {d3},[r0],r2 @source row 3 |
| vmlal.s16 q15,d1,d13 @out row 0 += row1 * coeff_1 |
| vmlal.s16 q15,d2,d14 @out row 0 += row2 * coeff_2 |
| add r9,r1,r3 @r9 = pi2_dst + dst_strd |
| vmlal.s16 q15,d3,d15 @out row 0 += row3 * coeff_3 |
| |
| vld1.16 {d4},[r0],r2 @source row 4 |
| vmull.s16 q14,d1,d12 @out row 1 = row1 * coeff_0 |
| addle r4,r4,r8 @row finished: source to the next group of 4 rows |
| movle r11,r6,lsl #1 @row finished: reload the per row sample count |
| vmlal.s16 q14,d2,d13 @out row 1 += row2 * coeff_1 |
| vmlal.s16 q14,d3,d14 @out row 1 += row3 * coeff_2 |
| vld1.s16 {d5},[r0],r2 @source row 5 |
| vmlal.s16 q14,d4,d15 @out row 1 += row4 * coeff_3 |
| |
| vqshrn.s32 d30,q15,#6 @out row 0: saturating narrow with right shift by 6 |
| |
| vld1.s16 {d6},[r0],r2 @source row 6 |
| vmull.s16 q13,d2,d12 @out row 2 = row2 * coeff_0 |
| vmlal.s16 q13,d3,d13 @out row 2 += row3 * coeff_1 |
| vmlal.s16 q13,d4,d14 @out row 2 += row4 * coeff_2 |
| add r0,r4,r2 @r0 = next source row of the following block |
| vld1.16 {d0},[r4]! @next block: source row 0 |
| vmlal.s16 q13,d5,d15 @out row 2 += row5 * coeff_3 |
| |
| vqshrn.s32 d28,q14,#6 @out row 1: saturating narrow with right shift by 6 |
| |
| vld1.16 {d1},[r0],r2 @next block: source row 1 |
| vmull.s16 q12,d3,d12 @out row 3 = row3 * coeff_0 |
| vst1.32 {d30},[r1]! @stores out row 0, r1 advances by 8 bytes |
| vmlal.s16 q12,d4,d13 @out row 3 += row4 * coeff_1 |
| vld1.16 {d2},[r0],r2 @next block: source row 2 |
| vmlal.s16 q12,d5,d14 @out row 3 += row5 * coeff_2 |
| vld1.16 {d3},[r0],r2 @next block: source row 3 |
| vmlal.s16 q12,d6,d15 @out row 3 += row6 * coeff_3 |
| addle r1,r1,lr,lsl #1 @row finished: pi2_dst to the next group of 4 rows (flags still from subs r11) |
| |
| vqshrn.s32 d26,q13,#6 @out row 2: saturating narrow with right shift by 6 |
| subs r12,r12,#4 @one block accounted for |
| |
| beq epilog @nothing left for the kernel, drain the pipeline |
| |
| kernel_4: |
| vmull.s16 q15,d0,d12 @out row 0 = row0 * coeff_0 |
| subs r11,r11,#4 @4 samples of the row consumed, flags used by addle/movle below |
| vmlal.s16 q15,d1,d13 @out row 0 += row1 * coeff_1 |
| vst1.32 {d28},[r9],r3 @stores out row 1 of the previous block |
| vmlal.s16 q15,d2,d14 @out row 0 += row2 * coeff_2 |
| vmlal.s16 q15,d3,d15 @out row 0 += row3 * coeff_3 |
| |
| vqshrn.s32 d24,q12,#6 @previous block out row 3: saturating narrow with right shift by 6 |
| |
| vld1.16 {d4},[r0],r2 @source row 4 |
| vmull.s16 q14,d1,d12 @out row 1 = row1 * coeff_0 |
| vmlal.s16 q14,d2,d13 @out row 1 += row2 * coeff_1 |
| vmlal.s16 q14,d3,d14 @out row 1 += row3 * coeff_2 |
| vmlal.s16 q14,d4,d15 @out row 1 += row4 * coeff_3 |
| vst1.32 {d26},[r9],r3 @stores out row 2 of the previous block |
| addle r4,r4,r8 @row finished: source to the next group of 4 rows |
| movle r11,r6,lsl #1 @row finished: reload the per row sample count |
| |
| vqshrn.s32 d30,q15,#6 @out row 0: saturating narrow with right shift by 6 |
| |
| vld1.s16 {d5},[r0],r2 @source row 5 |
| vmull.s16 q13,d2,d12 @out row 2 = row2 * coeff_0 |
| vld1.s16 {d6},[r0],r2 @source row 6 |
| vmlal.s16 q13,d3,d13 @out row 2 += row3 * coeff_1 |
| vst1.32 {d24},[r9] @stores out row 3 of the previous block |
| add r0,r4,r2 @r0 = next source row of the following block |
| vmlal.s16 q13,d4,d14 @out row 2 += row4 * coeff_2 |
| vld1.16 {d0},[r4]! @next block: source row 0 |
| vmlal.s16 q13,d5,d15 @out row 2 += row5 * coeff_3 |
| |
| vqshrn.s32 d28,q14,#6 @out row 1: saturating narrow with right shift by 6 |
| |
| vld1.16 {d1},[r0],r2 @next block: source row 1 |
| vmull.s16 q12,d3,d12 @out row 3 = row3 * coeff_0 |
| vld1.16 {d2},[r0],r2 @next block: source row 2 |
| vmlal.s16 q12,d4,d13 @out row 3 += row4 * coeff_1 |
| add r9,r1,r3 @r9 = pi2_dst + dst_strd |
| vld1.16 {d3},[r0],r2 @next block: source row 3 |
| vmlal.s16 q12,d5,d14 @out row 3 += row5 * coeff_2 |
| |
| vst1.32 {d30},[r1]! @stores out row 0, r1 advances by 8 bytes |
| vmlal.s16 q12,d6,d15 @out row 3 += row6 * coeff_3 |
| |
| vqshrn.s32 d26,q13,#6 @out row 2: saturating narrow with right shift by 6 |
| addle r1,r1,lr,lsl #1 @row finished: pi2_dst to the next group of 4 rows (flags still from subs r11) |
| |
| subs r12,r12,#4 @one block accounted for |
| |
| bgt kernel_4 @jumps to kernel_4 |
| |
| epilog: |
| vmull.s16 q15,d0,d12 @out row 0 = row0 * coeff_0 |
| vst1.32 {d28},[r9],r3 @stores out row 1 of the previous block |
| vmlal.s16 q15,d1,d13 @out row 0 += row1 * coeff_1 |
| vmlal.s16 q15,d2,d14 @out row 0 += row2 * coeff_2 |
| vmlal.s16 q15,d3,d15 @out row 0 += row3 * coeff_3 |
| |
| vqshrn.s32 d24,q12,#6 @previous block out row 3: saturating narrow with right shift by 6 |
| |
| vmull.s16 q14,d1,d12 @out row 1 = row1 * coeff_0 |
| vld1.16 {d4},[r0],r2 @source row 4 |
| vmlal.s16 q14,d2,d13 @out row 1 += row2 * coeff_1 |
| vst1.32 {d26},[r9],r3 @stores out row 2 of the previous block |
| vmlal.s16 q14,d3,d14 @out row 1 += row3 * coeff_2 |
| vmlal.s16 q14,d4,d15 @out row 1 += row4 * coeff_3 |
| |
| vqshrn.s32 d30,q15,#6 @out row 0: saturating narrow with right shift by 6 |
| |
| vmull.s16 q13,d2,d12 @out row 2 = row2 * coeff_0 |
| vld1.s16 {d5},[r0],r2 @source row 5 |
| vmlal.s16 q13,d3,d13 @out row 2 += row3 * coeff_1 |
| vmlal.s16 q13,d4,d14 @out row 2 += row4 * coeff_2 |
| vmlal.s16 q13,d5,d15 @out row 2 += row5 * coeff_3 |
| |
| vqshrn.s32 d28,q14,#6 @out row 1: saturating narrow with right shift by 6 |
| |
| vst1.32 {d24},[r9] @stores out row 3 of the previous block |
| vmull.s16 q12,d3,d12 @out row 3 = row3 * coeff_0 |
| vmlal.s16 q12,d4,d13 @out row 3 += row4 * coeff_1 |
| add r9,r1,r3 @r9 = pi2_dst + dst_strd |
| vld1.s16 {d6},[r0],r2 @source row 6 |
| vmlal.s16 q12,d5,d14 @out row 3 += row5 * coeff_2 |
| vmlal.s16 q12,d6,d15 @out row 3 += row6 * coeff_3 |
| vst1.32 {d30},[r1]! @stores out row 0 of the last block |
| |
| vqshrn.s32 d26,q13,#6 @out row 2: saturating narrow with right shift by 6 |
| |
| vst1.32 {d28},[r9],r3 @stores out row 1 of the last block |
| |
| vqshrn.s32 d24,q12,#6 @out row 3: saturating narrow with right shift by 6 |
| vst1.32 {d26},[r9],r3 @stores out row 2 of the last block |
| |
| vst1.32 {d24},[r9] @stores out row 3 of the last block |
| |
| end_loops: |
| vpop {d8-d15} @restore the callee-saved neon registers |
| ldmfd sp!,{r4-r12,r15} @restore core registers and return (saved lr is loaded into pc) |
| |
| |
| |
| |