| //****************************************************************************** |
| //* |
| //* Copyright (C) 2015 The Android Open Source Project |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //***************************************************************************** |
| //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| //*/ |
| ///** |
| // ******************************************************************************* |
| // * @file |
| // * ih264_padding_neon.s |
| // * |
| // * @brief |
| // * Contains function definitions padding |
| // * |
| // * @author |
| // * Ittiam |
| // * |
| // * @par List of Functions: |
| // * - ih264_pad_top_av8() |
| // * - ih264_pad_left_luma_av8() |
| // * - ih264_pad_left_chroma_av8() |
| // * - ih264_pad_right_luma_av8() |
| // * - ih264_pad_right_chroma_av8() |
| // * |
| // * @remarks |
| // * None |
| // * |
| // ******************************************************************************* |
| //*/ |
| |
| .text |
| .p2align 2 |
| .include "ih264_neon_macros.s" |
| ///** |
| //******************************************************************************* |
| //* |
| //* @brief pad at the top of a 2d array |
| //* |
| //* @par Description: |
| //* The top row of a 2d array is replicated for pad_size times at the top |
| //* |
| //* @param[in] pu1_src |
| //* UWORD8 pointer to the source |
| //* |
| //* @param[in] src_strd |
| //* integer source stride |
| //* |
| //* @param[in] wd |
| //* integer width of the array |
| //* |
| //* @param[in] pad_size |
| //* integer -padding size of the array |
| //* |
| //* @returns none |
| //* |
| //* @remarks none |
| //* |
| //******************************************************************************* |
| //*/ |
| //void ih264_pad_top(UWORD8 *pu1_src, |
| // WORD32 src_strd, |
| // WORD32 wd, |
| // WORD32 pad_size) |
| //**************Variables Vs Registers************************* |
| // x0 => *pu1_src |
| // x1 => src_strd |
| // x2 => wd |
| // x3 => pad_size |
| |
| .global ih264_pad_top_av8 |
| |
| // Replicates the row at pu1_src into the pad_size rows directly above it. |
| // The row is processed in 16-byte strips: each strip is loaded once and then |
| // stored pad_size times while walking upwards. A wd that is not a multiple |
| // of 16 is rounded up to the next strip. Nothing is written when wd or |
| // pad_size is zero or negative. |
| // |
| // Register usage: |
| // x0 - read pointer in the source row, advances 16 bytes per strip |
| // x1 - src_strd |
| // x2 - width still to be processed, consumed 16 bytes at a time |
| // x3 - pad_size |
| // x4 - store pointer, moves up one row per store |
| // x5 - start of the current strip one row above the source row |
| // x6 - -src_strd, the post-index step that moves x4 up one row |
| // x7 - rows left to write in the current strip |
| // v0 - the 16 source bytes being replicated |
| |
| ih264_pad_top_av8: |
| |
|     push_v_regs // macro from ih264_neon_macros.s (presumably saves v8-v15) |
|     stp x19, x20, [sp, #-16]! |
| |
|     // The C prototype passes WORD32 values; the upper halves of x1-x3 are |
|     // unspecified on entry, so widen them before any 64-bit arithmetic. |
|     sxtw x1, w1 |
|     sxtw x2, w2 |
|     sxtw x3, w3 |
| |
|     cmp x2, #0 |
|     ble end_func_pad_top // no columns to copy |
|     cmp x3, #0 |
|     ble end_func_pad_top // no rows to fill |
| |
|     sub x5, x0, x1 // first destination: the row just above the source |
|     neg x6, x1 // step that moves the store pointer up one row |
| |
| loop_neon_memcpy_mul_16: |
|     ld1 {v0.16b}, [x0], #16 // fetch the next 16 source bytes |
|     mov x4, x5 // restart at the bottom pad row of this strip |
|     mov x7, x3 // pad_size rows to write |
|     add x5, x5, #16 // next strip starts 16 bytes to the right |
| |
| loop_neon_pad_top: |
|     st1 {v0.16b}, [x4], x6 // write one pad row, then move up |
|     subs x7, x7, #1 |
|     bgt loop_neon_pad_top |
| |
|     subs x2, x2, #16 |
|     bgt loop_neon_memcpy_mul_16 |
| |
| end_func_pad_top: |
|     ldp x19, x20, [sp], #16 |
|     pop_v_regs |
|     ret |
| |
| |
| |
| |
| ///** |
| //******************************************************************************* |
| //* |
| //* @brief |
| //* Padding (luma block) at the left of a 2d array |
| //* |
| //* @par Description: |
| //* The left column of a 2d array is replicated for pad_size times at the left |
| //* |
| //* |
| //* @param[in] pu1_src |
| //* UWORD8 pointer to the source |
| //* |
| //* @param[in] src_strd |
| //* integer source stride |
| //* |
| //* @param[in] ht |
| //* integer height of the array |
| //* |
| //* @param[in] pad_size |
| //* integer padding size (number of columns filled on each row) |
| //* |
| //* @returns |
| //* |
| //* @remarks |
| //* None |
| //* |
| //******************************************************************************* |
| //*/ |
| //#if PAD_LEFT_LUMA == C |
| //void ih264_pad_left_luma(UWORD8 *pu1_src, |
| // WORD32 src_strd, |
| // WORD32 ht, |
| // WORD32 pad_size) |
| //**************Variables Vs Registers************************* |
| // x0 => *pu1_src |
| // x1 => src_strd |
| // x2 => ht |
| // x3 => pad_size |
| |
| |
| |
| .global ih264_pad_left_luma_av8 |
| |
| // Fills the pad region to the left of every row with that row's first pixel. |
| // Eight rows are handled per loop pass, so ht is rounded up to a multiple |
| // of 8; nothing is written when ht is zero or negative. Each row receives |
| // 16 bytes when pad_size == 16 and 32 bytes for every other pad_size |
| // (presumably callers only ever pass 16 or 32 - confirm against callers). |
| // |
| // Register usage: |
| // x0 - pointer to the first pixel of the row being read, steps by src_strd |
| // x1 - src_strd |
| // x2 - rows remaining |
| // x3 - pad_size |
| // x4 - store pointer, starts at pu1_src - pad_size |
| // x6 - src_strd - 16: step from the second half of a 32-byte pad row to |
| //      the start of the next pad row |
| // v0-v7 - one replicated pixel per row of the current group of eight |
| |
| ih264_pad_left_luma_av8: |
| |
|     push_v_regs // macro from ih264_neon_macros.s (presumably saves v8-v15) |
|     stp x19, x20, [sp, #-16]! |
| |
|     // The C prototype passes WORD32 values; the upper halves of x1-x3 are |
|     // unspecified on entry, so widen them before any 64-bit arithmetic. |
|     sxtw x1, w1 |
|     sxtw x2, w2 |
|     sxtw x3, w3 |
| |
|     cmp x2, #0 |
|     ble end_func // no rows to pad |
| |
|     sub x4, x0, x3 // pad region starts pad_size bytes left of the row |
|     sub x6, x1, #16 |
|     cmp x3, #16 |
|     bne loop_32 |
| |
| loop_16: // 16-byte pad, 8 rows per pass |
|     // ld1r reads one byte and copies it into all 16 lanes; the register |
|     // post-index moves x0 down to the next row. |
|     ld1r {v0.16b}, [x0], x1 |
|     ld1r {v1.16b}, [x0], x1 |
|     ld1r {v2.16b}, [x0], x1 |
|     ld1r {v3.16b}, [x0], x1 |
|     ld1r {v4.16b}, [x0], x1 |
|     ld1r {v5.16b}, [x0], x1 |
|     ld1r {v6.16b}, [x0], x1 |
|     ld1r {v7.16b}, [x0], x1 |
|     st1 {v0.16b}, [x4], x1 |
|     st1 {v1.16b}, [x4], x1 |
|     st1 {v2.16b}, [x4], x1 |
|     st1 {v3.16b}, [x4], x1 |
|     st1 {v4.16b}, [x4], x1 |
|     st1 {v5.16b}, [x4], x1 |
|     st1 {v6.16b}, [x4], x1 |
|     st1 {v7.16b}, [x4], x1 |
|     subs x2, x2, #8 |
|     bgt loop_16 |
|     b end_func |
| |
| loop_32: // 32-byte pad, 8 rows per pass |
|     ld1r {v0.16b}, [x0], x1 |
|     ld1r {v1.16b}, [x0], x1 |
|     ld1r {v2.16b}, [x0], x1 |
|     ld1r {v3.16b}, [x0], x1 |
|     ld1r {v4.16b}, [x0], x1 |
|     ld1r {v5.16b}, [x0], x1 |
|     ld1r {v6.16b}, [x0], x1 |
|     ld1r {v7.16b}, [x0], x1 |
|     // Each row is written as two 16-byte halves: +16 to reach the second |
|     // half, then +x6 (src_strd - 16) to reach the next row. |
|     st1 {v0.16b}, [x4], #16 |
|     st1 {v0.16b}, [x4], x6 |
|     st1 {v1.16b}, [x4], #16 |
|     st1 {v1.16b}, [x4], x6 |
|     st1 {v2.16b}, [x4], #16 |
|     st1 {v2.16b}, [x4], x6 |
|     st1 {v3.16b}, [x4], #16 |
|     st1 {v3.16b}, [x4], x6 |
|     st1 {v4.16b}, [x4], #16 |
|     st1 {v4.16b}, [x4], x6 |
|     st1 {v5.16b}, [x4], #16 |
|     st1 {v5.16b}, [x4], x6 |
|     st1 {v6.16b}, [x4], #16 |
|     st1 {v6.16b}, [x4], x6 |
|     st1 {v7.16b}, [x4], #16 |
|     st1 {v7.16b}, [x4], x6 |
|     subs x2, x2, #8 |
|     bgt loop_32 |
| |
| end_func: |
|     ldp x19, x20, [sp], #16 |
|     pop_v_regs |
|     ret |
| |
| |
| |
| |
| |
| ///** |
| //******************************************************************************* |
| //* |
| //* @brief |
| //* Padding (chroma block) at the left of a 2d array |
| //* |
| //* @par Description: |
| //* The left column of a 2d array is replicated for pad_size times at the left |
| //* |
| //* |
| //* @param[in] pu1_src |
| //* UWORD8 pointer to the source |
| //* |
| //* @param[in] src_strd |
| //* integer source stride |
| //* |
| //* @param[in] ht |
| //* integer height of the array |
| //* |
| //* @param[in] pad_size |
| //* integer padding size in bytes (both colour components together) |
| //* |
| //* @returns |
| //* |
| //* @remarks |
| //* None |
| //* |
| //******************************************************************************* |
| //*/ |
| //#if PAD_LEFT_CHROMA == C |
| //void ih264_pad_left_chroma(UWORD8 *pu1_src, |
| // WORD32 src_strd, |
| // WORD32 ht, |
| // WORD32 pad_size) |
| //{ |
| // x0 => *pu1_src |
| // x1 => src_strd |
| // x2 => ht |
| // x3 => pad_size |
| |
| |
| |
| .global ih264_pad_left_chroma_av8 |
| |
| // Fills the pad region to the left of every row with that row's first |
| // 16-bit element (presumably one interleaved chroma pair - confirm against |
| // callers). Four rows are handled per loop pass, so ht is rounded up to a |
| // multiple of 4; nothing is written when ht is zero or negative. Every row |
| // receives exactly 32 bytes starting at pu1_src - pad_size, whatever the |
| // value of pad_size. |
| // |
| // Register usage: |
| // x0 - pointer to the first element of the row being read, steps by src_strd |
| // x1 - src_strd |
| // x2 - rows remaining |
| // x3 - pad_size |
| // x4 - store pointer, starts at pu1_src - pad_size |
| // x6 - src_strd - 16: step from the second half of a pad row to the next row |
| // v0-v3 - one replicated element per row of the current group of four |
| |
| ih264_pad_left_chroma_av8: |
| |
|     push_v_regs // macro from ih264_neon_macros.s (presumably saves v8-v15) |
|     stp x19, x20, [sp, #-16]! |
| |
|     // The C prototype passes WORD32 values; the upper halves of x1-x3 are |
|     // unspecified on entry, so widen them before any 64-bit arithmetic. |
|     sxtw x1, w1 |
|     sxtw x2, w2 |
|     sxtw x3, w3 |
| |
|     cmp x2, #0 |
|     ble end_func_l_c // no rows to pad |
| |
|     sub x4, x0, x3 // pad region starts pad_size bytes left of the row |
|     sub x6, x1, #16 |
| |
| loop_32_l_c: // 32-byte pad, 4 rows per pass |
|     // ld1r reads one halfword and copies it into all 8 lanes; the register |
|     // post-index moves x0 down to the next row. |
|     ld1r {v0.8h}, [x0], x1 |
|     ld1r {v1.8h}, [x0], x1 |
|     ld1r {v2.8h}, [x0], x1 |
|     ld1r {v3.8h}, [x0], x1 |
|     // Each row is written as two 16-byte halves: +16 to reach the second |
|     // half, then +x6 (src_strd - 16) to reach the next row. |
|     st1 {v0.8h}, [x4], #16 |
|     st1 {v0.8h}, [x4], x6 |
|     st1 {v1.8h}, [x4], #16 |
|     st1 {v1.8h}, [x4], x6 |
|     st1 {v2.8h}, [x4], #16 |
|     st1 {v2.8h}, [x4], x6 |
|     st1 {v3.8h}, [x4], #16 |
|     st1 {v3.8h}, [x4], x6 |
|     subs x2, x2, #4 |
|     bgt loop_32_l_c |
| |
| end_func_l_c: |
|     ldp x19, x20, [sp], #16 |
|     pop_v_regs |
|     ret |
| |
| |
| |
| |
| |
| ///** |
| //******************************************************************************* |
| //* |
| //* @brief |
| //* Padding (luma block) at the right of a 2d array |
| //* |
| //* @par Description: |
| //* The right column of a 2d array is replicated for pad_size times at the right |
| //* |
| //* |
| //* @param[in] pu1_src |
| //* UWORD8 pointer to the source |
| //* |
| //* @param[in] src_strd |
| //* integer source stride |
| //* |
| //* @param[in] ht |
| //* integer height of the array |
| //* |
| //* @param[in] pad_size |
| //* integer padding size (number of columns filled on each row) |
| //* |
| //* @returns |
| //* |
| //* @remarks |
| //* None |
| //* |
| //******************************************************************************* |
| //*/ |
| //#if PAD_RIGHT_LUMA == C |
| //void ih264_pad_right_luma(UWORD8 *pu1_src, |
| // WORD32 src_strd, |
| // WORD32 ht, |
| // WORD32 pad_size) |
| //{ |
| // WORD32 row; |
| // |
| // for(row = 0; row < ht; row++) |
| // { |
| // memset(pu1_src, *(pu1_src -1), pad_size); |
| // |
| // pu1_src += src_strd; |
| // } |
| //} |
| // |
| // x0 => *pu1_src |
| // x1 => src_strd |
| // x2 => ht |
| // x3 => pad_size |
| |
| |
| |
| .global ih264_pad_right_luma_av8 |
| |
| // Fills the pad region that starts at pu1_src on every row with the pixel |
| // immediately to its left (pu1_src[-1]). Eight rows are handled per loop |
| // pass, so ht is rounded up to a multiple of 8; nothing is written when ht |
| // is zero or negative. Each row receives 16 bytes when pad_size == 16 and |
| // 32 bytes for every other pad_size (presumably callers only ever pass 16 |
| // or 32 - confirm against callers). |
| // |
| // Register usage: |
| // x0 - pointer to the last pixel of the row being read, steps by src_strd |
| // x1 - src_strd |
| // x2 - rows remaining |
| // w3 - pad_size (only compared as a 32-bit value) |
| // x4 - store pointer, starts at pu1_src |
| // x6 - src_strd - 16: step from the second half of a 32-byte pad row to |
| //      the start of the next pad row |
| // v0-v7 - one replicated pixel per row of the current group of eight |
| |
| ih264_pad_right_luma_av8: |
| |
|     push_v_regs // macro from ih264_neon_macros.s (presumably saves v8-v15) |
|     stp x19, x20, [sp, #-16]! |
| |
|     // The C prototype passes WORD32 values; the upper halves of x1 and x2 |
|     // are unspecified on entry, so widen them before 64-bit arithmetic. |
|     sxtw x1, w1 |
|     sxtw x2, w2 |
| |
|     cmp x2, #0 |
|     ble end_func_r // no rows to pad |
| |
|     mov x4, x0 // pad region starts at pu1_src |
|     sub x6, x1, #16 |
|     sub x0, x0, #1 // read the pixel just left of the pad region |
|     cmp w3, #16 |
|     bne loop_32_r // stay inside this function's own 32-byte loop |
| |
| loop_16_r: // 16-byte pad, 8 rows per pass |
|     // ld1r reads one byte and copies it into all 16 lanes; the register |
|     // post-index moves x0 down to the next row. |
|     ld1r {v0.16b}, [x0], x1 |
|     ld1r {v1.16b}, [x0], x1 |
|     ld1r {v2.16b}, [x0], x1 |
|     ld1r {v3.16b}, [x0], x1 |
|     ld1r {v4.16b}, [x0], x1 |
|     ld1r {v5.16b}, [x0], x1 |
|     ld1r {v6.16b}, [x0], x1 |
|     ld1r {v7.16b}, [x0], x1 |
|     st1 {v0.16b}, [x4], x1 |
|     st1 {v1.16b}, [x4], x1 |
|     st1 {v2.16b}, [x4], x1 |
|     st1 {v3.16b}, [x4], x1 |
|     st1 {v4.16b}, [x4], x1 |
|     st1 {v5.16b}, [x4], x1 |
|     st1 {v6.16b}, [x4], x1 |
|     st1 {v7.16b}, [x4], x1 |
|     subs x2, x2, #8 |
|     bgt loop_16_r |
|     b end_func_r |
| |
| loop_32_r: // 32-byte pad, 8 rows per pass |
|     ld1r {v0.16b}, [x0], x1 |
|     ld1r {v1.16b}, [x0], x1 |
|     ld1r {v2.16b}, [x0], x1 |
|     ld1r {v3.16b}, [x0], x1 |
|     ld1r {v4.16b}, [x0], x1 |
|     ld1r {v5.16b}, [x0], x1 |
|     ld1r {v6.16b}, [x0], x1 |
|     ld1r {v7.16b}, [x0], x1 |
|     // Each row is written as two 16-byte halves: +16 to reach the second |
|     // half, then +x6 (src_strd - 16) to reach the next row. |
|     st1 {v0.16b}, [x4], #16 |
|     st1 {v0.16b}, [x4], x6 |
|     st1 {v1.16b}, [x4], #16 |
|     st1 {v1.16b}, [x4], x6 |
|     st1 {v2.16b}, [x4], #16 |
|     st1 {v2.16b}, [x4], x6 |
|     st1 {v3.16b}, [x4], #16 |
|     st1 {v3.16b}, [x4], x6 |
|     st1 {v4.16b}, [x4], #16 |
|     st1 {v4.16b}, [x4], x6 |
|     st1 {v5.16b}, [x4], #16 |
|     st1 {v5.16b}, [x4], x6 |
|     st1 {v6.16b}, [x4], #16 |
|     st1 {v6.16b}, [x4], x6 |
|     st1 {v7.16b}, [x4], #16 |
|     st1 {v7.16b}, [x4], x6 |
|     subs x2, x2, #8 |
|     bgt loop_32_r |
| |
| end_func_r: |
|     ldp x19, x20, [sp], #16 |
|     pop_v_regs |
|     ret |
| |
| |
| |
| |
| |
| ///** |
| //******************************************************************************* |
| //* |
| //* @brief |
| //* Padding (chroma block) at the right of a 2d array |
| //* |
| //* @par Description: |
| //* The right column of a 2d array is replicated for pad_size times at the right |
| //* |
| //* |
| //* @param[in] pu1_src |
| //* UWORD8 pointer to the source |
| //* |
| //* @param[in] src_strd |
| //* integer source stride |
| //* |
| //* @param[in] ht |
| //* integer height of the array |
| //* |
| //* @param[in] pad_size |
| //* integer padding size in bytes (both colour components together) |
| //* |
| //* @returns |
| //* |
| //* @remarks |
| //* None |
| //* |
| //******************************************************************************* |
| //*/ |
| //#if PAD_RIGHT_CHROMA == C |
| //void ih264_pad_right_chroma(UWORD8 *pu1_src, |
| // WORD32 src_strd, |
| // WORD32 ht, |
| // WORD32 pad_size) |
| // x0 => *pu1_src |
| // x1 => src_strd |
| // x2 => ht |
| // x3 => pad_size |
| |
| |
| |
| .global ih264_pad_right_chroma_av8 |
| |
| // Fills the pad region that starts at pu1_src on every row with the 16-bit |
| // element immediately to its left (presumably the row's last interleaved |
| // chroma pair - confirm against callers). Four rows are handled per loop |
| // pass, so ht is rounded up to a multiple of 4; nothing is written when ht |
| // is zero or negative. Every row receives exactly 32 bytes; pad_size (x3) |
| // is not read. |
| // |
| // Register usage: |
| // x0 - pointer to the last element of the row being read, steps by src_strd |
| // x1 - src_strd |
| // x2 - rows remaining |
| // x4 - store pointer, starts at pu1_src |
| // x6 - src_strd - 16: step from the second half of a pad row to the next row |
| // v0-v3 - one replicated element per row of the current group of four |
| |
| ih264_pad_right_chroma_av8: |
| |
|     push_v_regs // macro from ih264_neon_macros.s (presumably saves v8-v15) |
|     stp x19, x20, [sp, #-16]! |
| |
|     // The C prototype passes WORD32 values; the upper halves of x1 and x2 |
|     // are unspecified on entry, so widen them before 64-bit arithmetic. |
|     sxtw x1, w1 |
|     sxtw x2, w2 |
| |
|     cmp x2, #0 |
|     ble end_func_r_c // no rows to pad |
| |
|     mov x4, x0 // pad region starts at pu1_src |
|     sub x6, x1, #16 |
|     sub x0, x0, #2 // read the 16-bit element just left of the pad region |
| |
| loop_32_r_c: // 32-byte pad, 4 rows per pass |
|     // ld1r reads one halfword and copies it into all 8 lanes; the register |
|     // post-index moves x0 down to the next row. |
|     ld1r {v0.8h}, [x0], x1 |
|     ld1r {v1.8h}, [x0], x1 |
|     ld1r {v2.8h}, [x0], x1 |
|     ld1r {v3.8h}, [x0], x1 |
|     // Each row is written as two 16-byte halves: +16 to reach the second |
|     // half, then +x6 (src_strd - 16) to reach the next row. |
|     st1 {v0.8h}, [x4], #16 |
|     st1 {v0.8h}, [x4], x6 |
|     st1 {v1.8h}, [x4], #16 |
|     st1 {v1.8h}, [x4], x6 |
|     st1 {v2.8h}, [x4], #16 |
|     st1 {v2.8h}, [x4], x6 |
|     st1 {v3.8h}, [x4], #16 |
|     st1 {v3.8h}, [x4], x6 |
|     subs x2, x2, #4 |
|     bgt loop_32_r_c |
| |
| end_func_r_c: |
|     ldp x19, x20, [sp], #16 |
|     pop_v_regs |
|     ret |
| |
| |
| |
| |
| |
| |