@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@* ihevc_inter_pred_luma_horz_w16out.s
@*
@* @brief
@* contains function definitions for inter prediction interpolation.
@* functions are coded in neon assembly and can be compiled using
@* rvct
@*
@* @author
@* parthiban v
@*
@* @par list of functions:
@*
@* - ihevc_inter_pred_luma_horz_w16out()
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@* inter prediction luma filter for horizontal 16-bit output
@*
@* @par description:
@* applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
@* to the elements pointed to by 'pu1_src' and writes the result to the
@* location pointed to by 'pi2_dst'. no downshifting or clipping is done,
@* so the output can be used as an input for vertical filtering or
@* weighted prediction.
@*
@* assumptions: the function is optimized assuming the width is a multiple
@* of 4 or 8. if the width is a multiple of 4, the height should be a
@* multiple of 2; the width-8 case is optimized further.
@*
@* @param[in] pu1_src
@* uword8 pointer to the source
@*
@* @param[out] pi2_dst
@* word16 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] pi1_coeff
@* word8 pointer to the filter coefficients
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
@ word16 *pi2_dst,
@ word32 src_strd,
@ word32 dst_strd,
@ word8 *pi1_coeff,
@ word32 ht,
@ word32 wd)
@r0 - free
@r1 - dst_ptr
@r2 - src_strd
@r3 - dst_strd
@r4 - src_ptr2
@r5 - inner loop counter
@r6 - dst_ptr2
@r7 - free
@r8 - dst_strd2
@r9 - src_strd1
@r10 - wd
@r11 - #1
@r12 - src_ptr1
@r14 - loop_counter
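@
@ the block below is a rough c-style sketch of what this routine computes;
@ it is an illustrative reference only (an assumption based on the header
@ description above, not the project's reference c code). it assumes an
@ 8-tap filter whose signed taps are read directly from pi1_coeff:
@
@     for(row = 0; row < ht; row++)
@     {
@         for(col = 0; col < wd; col++)
@         {
@             word32 sum = 0;
@             for(i = 0; i < 8; i++)
@                 sum += pi1_coeff[i] * pu1_src[col + (i - 3)];
@             pi2_dst[col] = (word16)sum; /* no rounding, shift or clip */
@         }
@         pu1_src += src_strd;
@         pi2_dst += dst_strd;
@     }
@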
.equ coeff_offset, 104
.equ ht_offset, 108
.equ wd_offset, 112
.text
.align 4
.syntax unified
.globl ihevc_inter_pred_luma_horz_w16out_a9q
.type ihevc_inter_pred_luma_horz_w16out_a9q, %function
ihevc_inter_pred_luma_horz_w16out_a9q:
bic r14, #1 @ clearing bit[0], so that the return goes back to arm mode
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
vpush {d8 - d15}
ldr r4,[sp,#coeff_offset] @loads pi1_coeff
ldr r7,[sp,#ht_offset] @loads ht
vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
sub r14,r7,#0 @r14 = ht (loop counter)
vabs.s8 d2,d0 @vabs_s8(coeff)
mov r11,#1
ldr r10,[sp,#wd_offset] @loads wd
vdup.8 d24,d2[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
sub r12,r0,#3 @pu1_src - 3
vdup.8 d25,d2[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
add r4,r12,r2 @pu1_src_tmp2_8 = pu1_src + src_strd
vdup.8 d26,d2[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
rsb r9,r10,r2,lsl #1 @2*src_strd - wd
vdup.8 d27,d2[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
rsb r8,r10,r3 @dst_strd - wd
vdup.8 d28,d2[4] @coeffabs_4 = vdup_lane_u8(coeffabs, 4)
vdup.8 d29,d2[5] @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
and r7,r14,#1 @calculating ht_residue ht_residue = (ht & 1)
vdup.8 d30,d2[6] @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
sub r14,r14,r7 @decrement height by ht_residue (residue value is calculated outside)
vdup.8 d31,d2[7] @coeffabs_7 = vdup_lane_u8(coeffabs, 7)
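@ note: vabs above turns the signed taps into magnitudes so that the unsigned
@ vmull/vmlal/vmlsl.u8 forms can be used; taps 0, 2, 5 and 7 (the negative taps
@ of the hevc luma filters) are applied with vmlsl and the rest with vmlal,
@ which restores the correct signs in the 16-bit accumulator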
cmp r7,#1
beq odd_height_decision
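@ width dispatch for the even-height path: wd <= 4 uses the 4-wide loop,
@ wd >= 16 the 16-wide loop and the rest the 8-wide loop; wd == 24 runs a
@ 16-wide pass followed by an 8-wide residual pass, and wd == 12 runs an
@ 8-wide pass followed by a 4-wide residual pass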
even_height_decision:
mov r7,r1
cmp r10,#4
ble outer_loop_4
cmp r10,#24
moveq r10,#16
addeq r8,#8
addeq r9,#8
cmp r10,#16
bge outer_loop_16_branch
cmp r10,#12
addeq r8,#4
addeq r9,#4
outer_loop_8_branch:
b outer_loop_8
outer_loop_16_branch:
b outer_loop_16
odd_height_decision:
cmp r10,#24
beq outer_loop_8_branch
cmp r10,#12
beq outer_loop_4
b even_height_decision
outer_loop4_residual:
sub r12,r0,#3 @pu1_src - 3
mov r1,r7
add r1,#16
mov r10,#4
add r12,#8
mov r14,#16
add r8,#4
add r9,#4
outer_loop_4:
add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
add r4,r12,r2 @pu1_src + src_strd
subs r5,r10,#0 @checks wd
ble end_inner_loop_4
inner_loop_4:
vld1.u32 {d0},[r12],r11 @vector load pu1_src
vld1.u32 {d1},[r12],r11
vld1.u32 {d2},[r12],r11
vld1.u32 {d3},[r12],r11
vld1.u32 {d4},[r12],r11
vld1.u32 {d5},[r12],r11
vld1.u32 {d6},[r12],r11
vld1.u32 {d7},[r12],r11
@add r12,r12,#4 @increment the input pointer
sub r12,r12,#4
@vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
@vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
@vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
@vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
@vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
@vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
@vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
vld1.u32 {d12},[r4],r11 @vector load pu1_src + src_strd
vld1.u32 {d13},[r4],r11
vzip.32 d0,d12 @vector zip the i iteration and ii iteration into a single register
vld1.u32 {d14},[r4],r11
vzip.32 d1,d13
vld1.u32 {d15},[r4],r11
vzip.32 d2,d14
vld1.u32 {d16},[r4],r11
vzip.32 d3,d15
vld1.u32 {d17},[r4],r11
vzip.32 d4,d16
vld1.u32 {d18},[r4],r11
vzip.32 d5,d17
vld1.u32 {d19},[r4],r11
sub r4,r4,#4
@ add r4,r4,#4 @increment the input pointer
@ vext.u8 d14,d12,d13,#2 @vector extract of src[0_2]
@ vext.u8 d15,d12,d13,#3 @vector extract of src[0_3]
@ vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
@ vext.u8 d17,d12,d13,#5 @vector extract of src[0_5]
@ vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
@ vext.u8 d19,d12,d13,#7 @vector extract of src[0_7]
@vext.u8 d13,d12,d13,#1 @vector extract of src[0_1]
vzip.32 d6,d18
vzip.32 d7,d19
vmull.u8 q4,d1,d25 @arithmetic operations for the ii iteration at the same time
vmlsl.u8 q4,d0,d24
vmlsl.u8 q4,d2,d26
vmlal.u8 q4,d3,d27
vmlal.u8 q4,d4,d28
vmlsl.u8 q4,d5,d29
vmlal.u8 q4,d6,d30
vmlsl.u8 q4,d7,d31
@ vqrshrun.s16 d8,q4,#6 @narrow right shift and saturating the result
vst1.64 {d8},[r1]! @store the i iteration result which is in the lower part of the register
vst1.64 {d9},[r6]! @store the ii iteration result which is in the upper part of the register
subs r5,r5,#4 @decrement the wd by 4
bgt inner_loop_4
end_inner_loop_4:
subs r14,r14,#2 @decrement the ht by 2
add r12,r12,r9 @increment the input pointer 2*src_strd-wd
add r1,r6,r8,lsl #1 @increment the output pointer 2*dst_strd-wd
bgt outer_loop_4
height_residue_4:
ldr r7,[sp,#ht_offset] @loads ht
and r7,r7,#1 @calculating ht_residue ht_residue = (ht & 1)
cmp r7,#0
beq end_loops
outer_loop_height_residue_4:
subs r5,r10,#0 @checks wd
ble end_inner_loop_height_residue_4
inner_loop_height_residue_4:
vld1.u32 {d0},[r12],r11 @vector load pu1_src
vld1.u32 {d1},[r12],r11
@ vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
@ vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
@ vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
@add r12,r12,#4 @increment the input pointer
@ vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
@ vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
@ vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
@ vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
vld1.u32 {d2},[r12],r11
vmull.u8 q4,d1,d25 @mul_res = vmull_u8(src[0_1], coeffabs_1)
vld1.u32 {d3},[r12],r11
vmlsl.u8 q4,d0,d24
vld1.u32 {d4},[r12],r11
vmlsl.u8 q4,d2,d26
vld1.u32 {d5},[r12],r11
vmlal.u8 q4,d3,d27
vld1.u32 {d6},[r12],r11
vmlal.u8 q4,d4,d28
vld1.u32 {d7},[r12],r11
vmlsl.u8 q4,d5,d29
sub r12,r12,#4
vmlal.u8 q4,d6,d30
vmlsl.u8 q4,d7,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)
subs r5,r5,#4 @decrement the wd by 4
vst1.64 {d8},[r1]!
bgt inner_loop_height_residue_4
end_inner_loop_height_residue_4:
subs r7,r7,#1 @decrement the ht by 1
rsb r9,r10,r2
add r12,r12,r9 @increment the input pointer src_strd-wd
add r1,r1,r8 @increment the output pointer dst_strd-wd
bgt outer_loop_height_residue_4
vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
outer_loop8_residual:
sub r12,r0,#3 @pu1_src - 3
mov r1,r7
mov r14,#32
add r1,#32
add r12,#16
mov r10,#8
add r8,#8
add r9,#8
outer_loop_8:
add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
add r4,r12,r2 @pu1_src + src_strd
subs r5,r10,#0 @checks wd
ble end_inner_loop_8
inner_loop_8:
vld1.u32 {d0},[r12],r11 @vector load pu1_src
vld1.u32 {d1},[r12],r11
vld1.u32 {d2},[r12],r11
vld1.u32 {d3},[r12],r11
@ vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
@ vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
@ vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
@ vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
@ vext.u8 d6,d0,d1,#6 @vector extract of src [0_6]
@ vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
@ vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
@ vext.u8 d14,d12,d13,#2
@vext.u8 d15,d12,d13,#3 @vector extract of src[0_3]
@ vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
@ vext.u8 d17,d12,d13,#5 @vector extract of src[0_5]
@vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
@vext.u8 d19,d12,d13,#7 @vector extract of src[0_7]
@vext.u8 d13,d12,d13,#1 @vector extract of src[0_1]
vld1.u32 {d4},[r12],r11
vmull.u8 q4,d1,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vld1.u32 {d5},[r12],r11
vmlal.u8 q4,d3,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {d6},[r12],r11
vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {d7},[r12],r11
vmlsl.u8 q4,d2,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {d12},[r4],r11 @vector load pu1_src + src_strd
vmlal.u8 q4,d4,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
vld1.u32 {d13},[r4],r11
vmlsl.u8 q4,d5,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
vld1.u32 {d14},[r4],r11
vmlal.u8 q4,d6,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
vld1.u32 {d15},[r4],r11
vmlsl.u8 q4,d7,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
vld1.u32 {d16},[r4],r11 @vector load pu1_src + src_strd
vmull.u8 q5,d15,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {d17},[r4],r11
vmlsl.u8 q5,d14,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vld1.u32 {d18},[r4],r11
vmlal.u8 q5,d16,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
vld1.u32 {d19},[r4],r11 @vector load pu1_src + src_strd
vmlsl.u8 q5,d17,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
@ vqrshrun.s16 d20,q4,#6 @right shift and saturating narrow result 1
vmlal.u8 q5,d18,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
vmlsl.u8 q5,d19,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
vst1.16 {q4},[r1]! @store the result pu1_dst
vmlsl.u8 q5,d12,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vmlal.u8 q5,d13,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
@ vqrshrun.s16 d8,q5,#6 @right shift and saturating narrow result 2
subs r5,r5,#8 @decrement the wd loop
vst1.16 {q5},[r6]! @store the result pu1_dst
cmp r5,#4
bgt inner_loop_8
end_inner_loop_8:
subs r14,r14,#2 @decrement the ht loop
add r12,r12,r9 @increment the src pointer by 2*src_strd-wd
add r1,r6,r8,lsl #1 @increment the dst pointer by 2*dst_strd-wd
bgt outer_loop_8
ldr r10,[sp,#wd_offset] @loads wd
cmp r10,#12
beq outer_loop4_residual
ldr r7,[sp,#ht_offset] @loads ht
and r7,r7,#1
cmp r7,#1
beq height_residue_4
vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp
outer_loop_16:
str r0, [sp, #-4]!
str r7, [sp, #-4]!
add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
add r4,r12,r2 @pu1_src + src_strd
and r0, r12, #31
sub r5,r10,#0 @checks wd
pld [r12, r2, lsl #1]
vld1.u32 {q0},[r12],r11 @vector load pu1_src
pld [r4, r2, lsl #1]
vld1.u32 {q1},[r12],r11
vld1.u32 {q2},[r12],r11
vld1.u32 {q3},[r12],r11
vld1.u32 {q6},[r12],r11
vmull.u8 q4,d2,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vld1.u32 {q7},[r12],r11
vmlal.u8 q4,d6,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {q8},[r12],r11
vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {q9},[r12],r11
vmlsl.u8 q4,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vmlal.u8 q4,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
vmlsl.u8 q4,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
vmlal.u8 q4,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
vmlsl.u8 q4,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
inner_loop_16:
subs r5,r5,#16
vmull.u8 q10,d3,d25
add r12,#8
vmlsl.u8 q10,d1,d24
vld1.u32 {q0},[r4],r11 @vector load pu1_src
vmlal.u8 q10,d7,d27
vld1.u32 {q1},[r4],r11
vmlsl.u8 q10,d5,d26
vld1.u32 {q2},[r4],r11
vmlal.u8 q10,d13,d28
vld1.u32 {q3},[r4],r11
vmlal.u8 q10,d17,d30
vld1.u32 {q6},[r4],r11
vmlsl.u8 q10,d15,d29
vld1.u32 {q7},[r4],r11
vmlsl.u8 q10,d19,d31
vld1.u32 {q8},[r4],r11
vmull.u8 q5,d2,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vld1.u32 {q9},[r4],r11
vmlal.u8 q5,d6,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
add r4,#8
vmlsl.u8 q5,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
pld [r12, r2, lsl #2]
pld [r4, r2, lsl #2]
vst1.8 {q4},[r1]! @store the result pu1_dst
vmlsl.u8 q5,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
addeq r12,r12,r9 @increment the src pointer by 2*src_strd-wd
vmlal.u8 q5,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
addeq r4,r12,r2 @pu1_src + src_strd
vmlsl.u8 q5,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
@ and r7, r12, #31
vmlal.u8 q5,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
subeq r14,r14,#2
vmlsl.u8 q5,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
@cmp r7, r0
vmull.u8 q11,d3,d25
@ pld [r12, r2, lsl #2]
vmlsl.u8 q11,d1,d24
vst1.16 {q10},[r1]!
vmlal.u8 q11,d7,d27
@ pld [r4, r2, lsl #2]
vmlsl.u8 q11,d5,d26
@ mov r0, r7
vmlal.u8 q11,d13,d28
cmp r14,#0
vmlal.u8 q11,d17,d30
vst1.16 {q5},[r6]!
vmlsl.u8 q11,d15,d29
vmlsl.u8 q11,d19,d31
beq epilog_16
vld1.u32 {q0},[r12],r11 @vector load pu1_src
vld1.u32 {q1},[r12],r11
vld1.u32 {q2},[r12],r11
vld1.u32 {q3},[r12],r11
vld1.u32 {q6},[r12],r11
vmull.u8 q4,d2,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
vld1.u32 {q7},[r12],r11
vmlal.u8 q4,d6,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
vld1.u32 {q8},[r12],r11
vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
vld1.u32 {q9},[r12],r11
vmlsl.u8 q4,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
vmlal.u8 q4,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
cmp r5,#0
vmlsl.u8 q4,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
moveq r5,r10
vmlal.u8 q4,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
vst1.8 {q11},[r6]! @store the result pu1_dst
vmlsl.u8 q4,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
addeq r1,r6,r8,lsl #1
addeq r6,r1,r3,lsl #1 @pu1_dst + dst_strd
b inner_loop_16
epilog_16:
@ vqrshrun.s16 d11,q11,#6
vst1.8 {q11},[r6]! @store the result pu1_dst
ldr r7, [sp], #4
ldr r0, [sp], #4
ldr r10,[sp,#wd_offset]
cmp r10,#24
beq outer_loop8_residual
add r1,r6,r8,lsl #1
ldr r7,[sp,#ht_offset] @loads ht
and r7,r7,#1
cmp r7,#1
beq height_residue_4
end_loops:
vpop {d8 - d15}
ldmfd sp!,{r4-r12,r15} @reload the registers from sp