| @/***************************************************************************** |
| @* |
| @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
| @* |
| @* Licensed under the Apache License, Version 2.0 (the "License"); |
| @* you may not use this file except in compliance with the License. |
| @* You may obtain a copy of the License at: |
| @* |
| @* http://www.apache.org/licenses/LICENSE-2.0 |
| @* |
| @* Unless required by applicable law or agreed to in writing, software |
| @* distributed under the License is distributed on an "AS IS" BASIS, |
| @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| @* See the License for the specific language governing permissions and |
| @* limitations under the License. |
| @* |
| @*****************************************************************************/ |
| @/** |
| @******************************************************************************* |
| @* @file |
| @* ihevc_inter_pred_chroma_copy_w16out_neon.s |
| @* |
| @* @brief |
| @* contains function definitions for inter prediction interpolation. |
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
| @* |
| @* @author |
| @* yogeswaran rs |
| @* |
| @* @par list of functions: |
| @* |
| @* |
| @* @remarks |
| @* none |
| @* |
| @******************************************************************************* |
| @*/ |
| @/** |
| @******************************************************************************* |
| @* |
| @* @brief |
| @* chroma interprediction filter for copy |
| @* |
| @* @par description: |
@* copies the array of width 'wd' and height 'ht' from the location pointed
@* by 'src' to the location pointed by 'dst', widening each 8-bit sample to
@* 16 bits and left-shifting it by 6 on the way
| @* |
| @* @param[in] pu1_src |
| @* uword8 pointer to the source |
| @* |
@* @param[out] pi2_dst
@* word16 pointer to the destination
| @* |
| @* @param[in] src_strd |
| @* integer source stride |
| @* |
| @* @param[in] dst_strd |
| @* integer destination stride |
| @* |
| @* @param[in] pi1_coeff |
| @* word8 pointer to the filter coefficients |
| @* |
| @* @param[in] ht |
| @* integer height of the array |
| @* |
| @* @param[in] wd |
| @* integer width of the array |
| @* |
| @* @returns |
| @* |
| @* @remarks |
| @* none |
| @* |
| @******************************************************************************* |
| @*/ |
| |
| @void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src, |
| @ word16 *pi2_dst, |
| @ word32 src_strd, |
| @ word32 dst_strd, |
| @ word8 *pi1_coeff, |
| @ word32 ht, |
| @ word32 wd) |
| @**************variables vs registers***************************************** |
| @r0 => *pu1_src |
| @r1 => *pi2_dst |
| @r2 => src_strd |
| @r3 => dst_strd |
| @r4 => *pi1_coeff |
| @r5 => ht |
| @r6 => wd |
| |
| .text |
| .align 4 |
| |
| |
| |
| |
| .globl ihevc_inter_pred_chroma_copy_w16out_a9q |
| |
| .type ihevc_inter_pred_chroma_copy_w16out_a9q, %function |
| |
ihevc_inter_pred_chroma_copy_w16out_a9q:

@ Copies a wd x ht block of u8 chroma samples to an s16 destination,
@ left-shifting every sample by 6 (vmovl.u8 + vshl #6 below).
@
@ Stack layout after the stmfd below (10 registers pushed = 40 bytes):
@   [sp,#40] = pi1_coeff (never read: pure copy, no filtering)
@   [sp,#44] = ht
@   [sp,#48] = wd
@ Register roles:
@   r0 = pu1_src, r1 = pi2_dst, r2 = src_strd, r3 = dst_strd
@   r12 = 2*wd (source bytes per row), r7 = ht
@   r8 = ht & 3 (leftover rows), r9 = ht rounded down to a multiple of 4
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
ldr r12,[sp,#48] @loads wd
lsl r12,r12,#1 @r12 = 2*wd (bytes per source row)
ldr r7,[sp,#44] @loads ht
cmp r7,#0 @ht condition(ht == 0)
ble end_loops @nothing to do for ht <= 0
and r8,r7,#3 @r8 = ht % 4 (leftover rows after the 4-row blocks)
sub r9,r7,r8 @r9 = ht rounded down to a multiple of 4
and r11,r7,#6
cmp r11,#6 @(ht & 6) == 6 -> take the 4-byte-wide path
beq loop_ht_6
tst r12,#7 @wd*2 a multiple of 8 bytes? if so use the 8-wide path
beq core_loop_wd_8

@ --- Path processing 4 source bytes per inner iteration -----------------
loop_ht_6:
sub r11,r12,#4 @r11 = 2*wd - 4: used to rewind pointers per 4-row block
lsls r6,r3,#1 @r6 = 2*dst_strd (dst row stride in BYTES, dst is 16-bit)
cmp r9,#0
beq outer_loop_wd_4_ht_2 @no full 4-row blocks -> only the short tail

@ Four rows per outer iteration; inner loop walks the row 4 bytes at a time.
outer_loop_wd_4:
subs r4,r12,#0 @r4 = bytes remaining in the current row group
ble end_inner_loop_wd_4

inner_loop_wd_4:
vld1.8 {d0},[r0] @load 4 bytes (only 4 lanes used), row 0
add r5,r0,r2 @pu1_src + src_strd -> rows 1..3 source pointer
vmovl.u8 q0,d0 @widen u8 -> u16
add r10,r1,r6 @pi2_dst + dst_strd -> rows 1..3 dest pointer
subs r4,r4,#4 @wd - 4
vshl.i64 q0,q0,#6 @<< 6 (64-bit lane shift; valid here - TODO confirm lanes never carry across 16-bit boundaries)
vld1.8 {d22},[r5],r2 @load row 1
add r0,r0,#4 @pu1_src += 4
vst1.64 {d0},[r1] @store 4 s16 results, row 0
add r1,r1,#8
vmovl.u8 q11,d22 @widen row 1
vld1.8 {d24},[r5],r2 @load row 2
vshl.i64 q11,q11,#6 @<< 6
vmovl.u8 q12,d24 @widen row 2
vst1.64 {d22},[r10],r6 @store row 1
vshl.i64 q12,q12,#6 @<< 6
vld1.8 {d26},[r5],r2 @load row 3
vst1.64 {d24},[r10],r6 @store row 2
vmovl.u8 q13,d26 @widen row 3
vshl.i64 q13,q13,#6 @<< 6
vst1.64 {d26},[r10],r6 @store row 3
bgt inner_loop_wd_4

end_inner_loop_wd_4:
subs r9,r9,#4 @one 4-row block done
sub r0,r5,r11 @rewind src to the start of the next 4-row block
sub r1,r10,r11,lsl #1 @rewind dst likewise (x2: 16-bit elements)
bgt outer_loop_wd_4
cmp r8,#0 @any leftover (< 4) rows?
bgt outer_loop_wd_4_ht_2


end_loops:
ldmfd sp!,{r4-r12,r15} @restore registers and return (pops into pc)


@ Tail of the 4-wide path: handles the remaining ht % 4 rows, 2 at a time.
outer_loop_wd_4_ht_2:
subs r4,r12,#0 @wd conditional subtract
ble end_inner_loop_wd_4

inner_loop_wd_4_ht_2:
vld1.8 {d0},[r0] @load 4 bytes, row 0
add r5,r0,r2 @pu1_src + src_strd
vmovl.u8 q0,d0 @widen u8 -> u16
add r10,r1,r6 @pi2_dst + dst_strd
subs r4,r4,#4 @wd - 4
vshl.i64 q0,q0,#6 @<< 6
vld1.8 {d22},[r5],r2 @load row 1
add r0,r0,#4 @pu1_src += 4
vst1.64 {d0},[r1] @store row 0
add r1,r1,#8
vmovl.u8 q11,d22 @widen row 1
vld1.8 {d24},[r5],r2 @load row 2 (result not stored in this 2-row tail)
vshl.i64 q11,q11,#6 @<< 6
vmovl.u8 q12,d24
vst1.64 {d22},[r10],r6 @store row 1
bgt inner_loop_wd_4_ht_2
b end_loops


@ --- Path processing 8 source bytes per iteration -----------------------
@ Software-pipelined: loads for block i+1 are issued while the shifts and
@ stores of block i complete (prolog / steady state / epilog structure).
core_loop_wd_8:
@sub r11,r12,#8
lsls r5,r3,#1 @r5 = 2*dst_strd (dst row stride in bytes)
rsb r11,r12,r3, lsl #2 @ r11 = (dst_strd * 4) - width
rsb r8,r12,r2,lsl #2 @r8 = (src_strd * 4) - width; r2->src_strd
mov r4,r12, lsr #3 @ divide by 8: 8-byte column blocks per row
mov r7,r9
mul r7, r4 @r7 = total number of 8x4 blocks to process
sub r4,r12,#0 @r4 = bytes remaining across the current row group
sub r7,r7,#4 @reserve one block for the epilog
cmp r9,#0
beq core_loop_wd_8_ht_2 @no full 4-row blocks -> 2-row path

prolog:
add r6,r0,r2 @pu1_src_tmp += src_strd (rows 1..3 source)
add r10,r1,r5 @pi2_dst_tmp = pi2_dst + dst_strd (rows 1..3 dest)
vld1.8 {d8},[r0]! @load 8 bytes, row 0
vld1.8 {d10},[r6],r2 @row 1
vld1.8 {d12},[r6],r2 @row 2
vld1.8 {d14},[r6],r2 @row 3
vmovl.u8 q8,d8 @widen u8 -> u16, rows 0..3
vmovl.u8 q9,d10
vmovl.u8 q10,d12
vmovl.u8 q11,d14
subs r4,r4,#8 @wd decrements by 8
vshl.i16 q0,q8,#6 @<< 6, rows 0..3
vshl.i16 q1,q9,#6
vshl.i16 q2,q10,#6
vshl.i16 q3,q11,#6
addle r0,r0,r8 @row group exhausted: advance src to next 4-row block
add r6,r0,r2 @pu1_src_tmp += src_strd
vld1.8 {d8},[r0]! @pipeline: load the NEXT block, row 0
vld1.8 {d10},[r6],r2 @row 1
vld1.8 {d12},[r6],r2 @row 2
vld1.8 {d14},[r6],r2 @row 3

vst1.16 {d0,d1},[r1]! @store 8 s16 results, row 0
addle r1,r1,r11,lsl #1 @advance dst to next 4-row block (x2: s16)
suble r4,r12,#0 @reset the per-row-group byte counter

subs r7,r7,#4 @one block consumed

blt epilog_end @only the already-loaded block remains
beq epilog @exactly one steady-state iteration left



@ Steady state: stores rows 1..3 of block i, widens/shifts block i+1,
@ and issues the loads for block i+2.
outer_loop_wd_8:

vst1.16 {d2,d3},[r10],r5 @store previous block, row 1
vmovl.u8 q8,d8 @widen prefetched row 0

vst1.16 {d4,d5},[r10],r5 @store previous block, row 2
vmovl.u8 q9,d10 @widen row 1

vst1.16 {d6,d7},[r10],r5 @store previous block, row 3
vmovl.u8 q10,d12 @widen row 2

vmovl.u8 q11,d14 @widen row 3

subs r4,r4,#8 @wd decrements by 8
addle r0,r0,r8 @row group exhausted: advance src to next 4-row block

add r6,r0,r2 @pu1_src_tmp += src_strd

vld1.8 {d8},[r0]! @load next block, row 0
vshl.i16 q0,q8,#6 @<< 6, rows 0..3

vld1.8 {d10},[r6],r2 @row 1
vshl.i16 q1,q9,#6 @<< 6

vld1.8 {d12},[r6],r2 @row 2
vshl.i16 q2,q10,#6 @<< 6

vld1.8 {d14},[r6],r2 @row 3
add r10,r1,r5 @dest pointer for rows 1..3 of this block

vshl.i16 q3,q11,#6 @<< 6

vst1.16 {d0,d1},[r1]! @store this block, row 0

addle r1,r1,r11,lsl #1 @advance dst to next 4-row block
suble r4,r12,#0 @reset the per-row-group byte counter

subs r7,r7,#4 @one more block done
bgt outer_loop_wd_8

@ Epilog: drain the last fully-loaded block (no further loads issued).
epilog:
vst1.16 {d2,d3},[r10],r5 @store previous block, row 1
vmovl.u8 q8,d8 @widen the final prefetched block
vst1.16 {d4,d5},[r10],r5 @store previous block, row 2
vmovl.u8 q9,d10

vst1.16 {d6,d7},[r10],r5 @store previous block, row 3
vmovl.u8 q10,d12

vmovl.u8 q11,d14
@add r6,r0,r2 @pu1_src_tmp += src_strd

vshl.i16 q0,q8,#6 @<< 6, rows 0..3
vshl.i16 q1,q9,#6
vshl.i16 q2,q10,#6
add r10,r1,r5 @dest pointer for rows 1..3 of the final block
vshl.i16 q3,q11,#6

vst1.16 {d0,d1},[r1]! @store final block, row 0
epilog_end:
vst1.16 {d2,d3},[r10],r5 @store final block, rows 1..3
vst1.16 {d4,d5},[r10],r5
vst1.16 {d6,d7},[r10],r5
b end_loops

@ 8-wide path with no full 4-row block: processes 2 rows per iteration.
core_loop_wd_8_ht_2:
add r6,r0,r2 @pu1_src_tmp += src_strd
add r10,r1,r5 @pi2_dst + dst_strd
vld1.8 {d8},[r0]! @load 8 bytes, row 0
vld1.8 {d10},[r6],r2 @row 1
vmovl.u8 q8,d8 @widen u8 -> u16
vmovl.u8 q9,d10
subs r12,r12,#8 @wd decrements by 8
vshl.i16 q0,q8,#6 @<< 6
vshl.i16 q1,q9,#6
vst1.16 {d0,d1},[r1]! @store row 0
vst1.16 {d2,d3},[r10],r5 @store row 1
bgt core_loop_wd_8_ht_2

ldmfd sp!,{r4-r12,r15} @restore registers and return (pops into pc)
| |
| |
| |
| |
| |
| |