common/arm64/ihevc_weighted_pred_bi.s - platform/external/libhevc - Git at Google

 ///*****************************************************************************
 //*
 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
 //*
 //* Licensed under the Apache License, Version 2.0 (the "License");
 //* you may not use this file except in compliance with the License.
 //* You may obtain a copy of the License at:
 //*
 //* http://www.apache.org/licenses/LICENSE-2.0
 //*
 //* Unless required by applicable law or agreed to in writing, software
 //* distributed under the License is distributed on an "AS IS" BASIS,
 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 //* See the License for the specific language governing permissions and
 //* limitations under the License.
 //*
 //*****************************************************************************/
 ///**
 //*******************************************************************************
 //* //file
 //*  ihevc_weighted_pred_bi.s
 //*
 //* //brief
 //*  contains function definitions for weighted prediction used in inter
 //* prediction
 //*
 //* //author
 //*  parthiban v
 //*
 //* //par list of functions:
 //*  - ihevc_weighted_pred_bi()
 //*
 //* //remarks
 //*  none
 //*
 //*******************************************************************************
 //*/
 ///**
 //*******************************************************************************
 //*
 //* //brief
 //*  does bi-weighted prediction on the arrays pointed by  pi2_src1 and
 //* pi2_src2 and stores it at location pointed  by pi2_dst   assumptions : the
 //* function is optimized considering the fact width and  height are multiple
 //* of 2.
 //*
 //* //par description:
 //*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
 //* off1 + 1) << (shift - 1) ) >> shift
 //*
 //* //param[in] pi2_src1
 //*  pointer to source 1
 //*
 //* //param[in] pi2_src2
 //*  pointer to source 2
 //*
 //* //param[out] pu1_dst
 //*  pointer to destination
 //*
 //* //param[in] src_strd1
 //*  source stride 1
 //*
 //* //param[in] src_strd2
 //*  source stride 2
 //*
 //* //param[in] dst_strd
 //*  destination stride
 //*
 //* //param[in] wgt0
 //*  weight to be multiplied to source 1
 //*
 //* //param[in] off0
 //*  offset 0
 //*
 //* //param[in] wgt1
 //*  weight to be multiplied to source 2
 //*
 //* //param[in] off1
 //*  offset 1
 //*
 //* //param[in] shift
 //*  (14 bit depth) + log2_weight_denominator
 //*
 //* //param[in] lvl_shift1
 //*  added before shift and offset
 //*
 //* //param[in] lvl_shift2
 //*  added before shift and offset
 //*
 //* //param[in] ht
 //*  height of the source
 //*
 //* //param[in] wd
 //*  width of the source
 //*
 //* //returns
 //*
 //* //remarks
 //*  none
 //*
 //*******************************************************************************
 //*/

 //void ihevc_weighted_pred_bi(word16 *pi2_src1,
 //                            word16 *pi2_src2,
 //                            uword8 *pu1_dst,
 //                            word32 src_strd1,
 //                            word32 src_strd2,
 //                            word32 dst_strd,
 //                            word32 wgt0,
 //                            word32 off0,
 //                            word32 wgt1,
 //                            word32 off1,
 //                            word32 shift,
 //                            word32 lvl_shift1,
 //                            word32 lvl_shift2,
 //                            word32 ht,
 //                            word32 wd)

 //**************variables vs registers*****************************************
 //    x0 => *pi2_src1
 //    x1 => *pi2_src2
 //    x2 => *pu1_dst
 //    x3 =>  src_strd1
 //    x4 =>  src_strd2
 //    x5 =>  dst_strd
 //    x6 =>  wgt0
 //    x7 =>  off0
 //    x8 =>  wgt1
 //    x9 =>  off1
 //    x10 =>  shift
 //    x11 =>  lvl_shift1
 //    x12 =>    lvl_shift2
 //    x14 =>    ht
 //    x7    =>    wd

 .text
 .align 4

 .include "ihevc_neon_macros.s"

 .globl ihevc_weighted_pred_bi_av8

 .type ihevc_weighted_pred_bi_av8, %function

 ihevc_weighted_pred_bi_av8:

     // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

     ldr         w8,[sp,#0]
     ldr         w9,[sp,#8]
     ldr         w10,[sp,#16]
     ldr         w11,[sp,#24]
     ldr         w12,[sp,#32]
     ldr         w13,[sp,#40]
     ldr         w14,[sp,#48]

     sxtw        x8,w8
     sxtw        x9,w9
     sxtw        x10,w10
     sxtw        x11,w11
     sxtw        x12,w12


     stp         x19, x20,[sp,#-16]!
     stp         x21, x22,[sp,#-16]!
     stp         x23, x24,[sp,#-16]!
     stp         x25, x26,[sp,#-16]!

     mov         x15,x4 // src_strd2 40
     mov         x16,x5 // dst_strd 44
     mov         x17,x6 // wgt0 48
     mov         x19,x7 // off0 52
     mov         x20,x8 // wgt1 56
     mov         x21,x9 // off1 60
     mov         x22,x10 // shift 64
     mov         x23,x11 // lvl_shift1 68
     mov         x24,x12 // lvl_shift2 72
     mov         x25,x13 // ht 76
     mov         x26,x14 // wd 80

     mov         x6,x17                      //load wgt0
     mov         x11,x23                     //load lvl_shift1
     mov         x12,x24                     //load lvl_shift2
     mov         v7.h[0],w6                  //moved for scalar multiplication
     mul         x4, x11 , x6                //lvl_shift1 * wgt0
     mov         x8,x20                      //load wgt1
     mov         x7,x19                      //load off0
     mov         v7.h[1],w8                  //moved for scalar multiplication
     madd        x4,x12,x8,x4                //(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
     mov         x9,x21                      //load off1
     add         x5,x7,x9                    //off0 + off1
     mov         x10,x22                     //load shift
     add         x5,x5,#1                    //off0 + off1 + 1
     sub         x14,x10,#1                  //shift - 1
     mov         x7,x26                      //load wd
     lsl         x5,x5,x14                   //((off0 + off1 + 1) << (shift - 1))
     dup         v28.4s,w10                  //vmovq_n_s32(0-shift)
     add         x4,x4,x5                    //tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
     dup         v30.4s,w4                   //vmovq_n_s32(tmp_lvl_shift)
     neg         v28.4s, v28.4s
     mov         x4,x15                      //load src_strd2
     lsl         x9,x7,#1
     mov         x5,x16                      //load dst_strd
     lsl         x3,x3,#1
     mov         x14,x25                     //load ht
     lsl         x4,x4,#1

     cmp         x14,#0                      //check ht == 0
     beq         end_loops                   //if equal, then end the function

 outer_loop:
     cmp         x7,#0                       //check wd == 0
     beq         end_loops                   //if equal, then end the function

 core_loop:
     add         x6,x0,x3                    //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
     add         x8,x1,x4                    //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
     ld1         {v0.4h},[x0],#8             //load and increment the pi2_src1
     add         x10,x2,x5                   //pu1_dst_tmp = pu1_dst + dst_strd
     ld1         {v1.4h},[x1],#8             //load and increment the pi2_src2
     smull       v4.4s, v0.4h, v7.h[0]       //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
     ld1         {v2.4h},[x6],x3             //load and increment the pi2_src_tmp1 ii iteration
     smull       v5.4s, v1.4h, v7.h[1]       //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
     ld1         {v3.4h},[x8],x4             //load and increment the pi2_src_tmp1 ii iteration
     add         v4.4s,  v4.4s ,  v5.4s      //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)

     ld1         {v0.4h},[x6],x3             //load and increment the pi2_src1 iii iteration
     smull       v6.4s, v2.4h, v7.h[0]       //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration

     ld1         {v1.4h},[x8],x4             //load and increment the pi2_src2 iii iteration
     add         v4.4s,  v4.4s ,  v30.4s     //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
     smull       v19.4s, v0.4h, v7.h[0]      //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration

     ld1         {v2.4h},[x6],x3             //load and increment the pi2_src_tmp1 iv iteration
     smull       v17.4s, v3.4h, v7.h[1]      //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
     sshl        v4.4s,v4.4s,v28.4s          //vshlq_s32(i4_tmp1_t1, tmp_shift_t)

     ld1         {v3.4h},[x8],x4             //load and increment the pi2_src_tmp1 iv iteration
     add         v6.4s,  v6.4s ,  v17.4s     //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration

     sqxtun      v4.4h, v4.4s                //vqmovun_s32(sto_res_tmp1)
     smull       v16.4s, v1.4h, v7.h[1]      //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration

     add         v6.4s,  v6.4s ,  v30.4s     //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
     //mov v5, v4                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
     add         v19.4s,  v19.4s ,  v16.4s   //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration

     sshl        v6.4s,v6.4s,v28.4s
     //vshl.s32    q5,q5,q14                    //vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
     smull       v18.4s, v2.4h, v7.h[0]      //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
     uqxtn       v4.8b,v4.8h
     //vqmovn.u16    d4,q2                        //vqmovn_u16(sto_res_tmp3)
     add         v19.4s,  v19.4s ,  v30.4s   //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration

     sqxtun      v6.4h, v6.4s                //vqmovun_s32(sto_res_tmp1) ii iteration
     smull       v20.4s, v3.4h, v7.h[1]      //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration

     sshl        v19.4s,v19.4s,v28.4s
     //vshl.s32    q7,q7,q14                    //vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
     //mov v11, v10                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration

     add         v18.4s,  v18.4s ,  v20.4s   //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
     sqxtun      v19.4h, v19.4s              //vqmovun_s32(sto_res_tmp1) iii iteration

     add         v18.4s,  v18.4s ,  v30.4s   //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
     st1         {v4.s}[0],[x2],#4           //store pu1_dst i iteration

     uqxtn       v6.8b,v6.8h
     //vqmovn.u16    d10,q5                        //vqmovn_u16(sto_res_tmp3) ii iteration
     sshl        v18.4s,v18.4s,v28.4s
     //vshl.s32    q9,q9,q14                    //vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
     st1         {v6.s}[0],[x10],x5          //store pu1_dst ii iteration


     //mov v15, v14                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
     uqxtn       v19.8b,v19.8h
     //vqmovn.u16    d14,q7                        //vqmovn_u16(sto_res_tmp3) iii iteration
     sqxtun      v18.4h, v18.4s              //vqmovun_s32(sto_res_tmp1) iv iteration
     //mov v19, v18                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
     st1         {v19.s}[0],[x10],x5         //store pu1_dst iii iteration
     uqxtn       v18.8b,v18.8h
     //vqmovn.u16    d18,q9                        //vqmovn_u16(sto_res_tmp3) iv iteration
     subs        x7,x7,#4                    //decrement wd by 4 and check for 0
     st1         {v18.s}[0],[x10],x5         //store pu1_dst iv iteration

     bgt         core_loop                   //if greater than 0 repeat the core loop again

 end_core_loop:
     sub         x20,x9,x3,lsl #2            //2*src_strd1 - wd
     neg         x11, x20
     subs        x14,x14,#4                  //decrement the ht by 4
     sub         x20,x9,x4,lsl #2            //2*src_strd2 - wd
     neg         x12, x20
     add         x0,x0,x11                   //pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
     asr         x7,x9,#1
     add         x1,x1,x12                   //pi2_src2 + 4*src_strd2 - 2*wd
     sub         x20,x7,x5,lsl #2            //2*dst_strd - wd
     neg         x10, x20
     add         x2,x2,x10                   //pu1_dst + dst_std - wd
     bgt         core_loop                   //if ht is greater than 0 goto outer_loop

 end_loops:
     // ldmfd sp!,{x4-x12,x15}              //reload the registers from sp
     ldp         x25, x26,[sp],#16
     ldp         x23, x24,[sp],#16
     ldp         x21, x22,[sp],#16
     ldp         x19, x20,[sp],#16

     ret
	///*****************************************************************************
	//*
	//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
	//*
	//* Licensed under the Apache License, Version 2.0 (the "License");
	//* you may not use this file except in compliance with the License.
	//* You may obtain a copy of the License at:
	//*
	//* http://www.apache.org/licenses/LICENSE-2.0
	//*
	//* Unless required by applicable law or agreed to in writing, software
	//* distributed under the License is distributed on an "AS IS" BASIS,
	//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	//* See the License for the specific language governing permissions and
	//* limitations under the License.
	//*
	//*****************************************************************************/
	///**
	//*******************************************************************************
	//* //file
	//* ihevc_weighted_pred_bi.s
	//*
	//* //brief
	//* contains function definitions for weighted prediction used in inter
	//* prediction
	//*
	//* //author
	//* parthiban v
	//*
	//* //par list of functions:
	//* - ihevc_weighted_pred_bi()
	//*
	//* //remarks
	//* none
	//*
	//*******************************************************************************
	//*/
	///**
	//*******************************************************************************
	//*
	//* //brief
	//* does bi-weighted prediction on the arrays pointed by pi2_src1 and
	//* pi2_src2 and stores it at location pointed by pi2_dst assumptions : the
	//* function is optimized considering the fact width and height are multiple
	//* of 2.
	//*
	//* //par description:
	//* dst = ( (src1 + lvl_shift1)wgt0 + (src2 + lvl_shift2)wgt1 + (off0 +
	//* off1 + 1) << (shift - 1) ) >> shift
	//*
	//* //param[in] pi2_src1
	//* pointer to source 1
	//*
	//* //param[in] pi2_src2
	//* pointer to source 2
	//*
	//* //param[out] pu1_dst
	//* pointer to destination
	//*
	//* //param[in] src_strd1
	//* source stride 1
	//*
	//* //param[in] src_strd2
	//* source stride 2
	//*
	//* //param[in] dst_strd
	//* destination stride
	//*
	//* //param[in] wgt0
	//* weight to be multiplied to source 1
	//*
	//* //param[in] off0
	//* offset 0
	//*
	//* //param[in] wgt1
	//* weight to be multiplied to source 2
	//*
	//* //param[in] off1
	//* offset 1
	//*
	//* //param[in] shift
	//* (14 bit depth) + log2_weight_denominator
	//*
	//* //param[in] lvl_shift1
	//* added before shift and offset
	//*
	//* //param[in] lvl_shift2
	//* added before shift and offset
	//*
	//* //param[in] ht
	//* height of the source
	//*
	//* //param[in] wd
	//* width of the source
	//*
	//* //returns
	//*
	//* //remarks
	//* none
	//*
	//*******************************************************************************
	//*/

	//void ihevc_weighted_pred_bi(word16 *pi2_src1,
	// word16 *pi2_src2,
	// uword8 *pu1_dst,
	// word32 src_strd1,
	// word32 src_strd2,
	// word32 dst_strd,
	// word32 wgt0,
	// word32 off0,
	// word32 wgt1,
	// word32 off1,
	// word32 shift,
	// word32 lvl_shift1,
	// word32 lvl_shift2,
	// word32 ht,
	// word32 wd)

	//************variables vs registers***************************************
	// x0 => *pi2_src1
	// x1 => *pi2_src2
	// x2 => *pu1_dst
	// x3 => src_strd1
	// x4 => src_strd2
	// x5 => dst_strd
	// x6 => wgt0
	// x7 => off0
	// x8 => wgt1
	// x9 => off1
	// x10 => shift
	// x11 => lvl_shift1
	// x12 => lvl_shift2
	// x14 => ht
	// x7 => wd

	.text
	.align 4

	.include "ihevc_neon_macros.s"

	.globl ihevc_weighted_pred_bi_av8

	.type ihevc_weighted_pred_bi_av8, %function

	ihevc_weighted_pred_bi_av8:

	// stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments

	ldr w8,[sp,#0]
	ldr w9,[sp,#8]
	ldr w10,[sp,#16]
	ldr w11,[sp,#24]
	ldr w12,[sp,#32]
	ldr w13,[sp,#40]
	ldr w14,[sp,#48]

	sxtw x8,w8
	sxtw x9,w9
	sxtw x10,w10
	sxtw x11,w11
	sxtw x12,w12


	stp x19, x20,[sp,#-16]!
	stp x21, x22,[sp,#-16]!
	stp x23, x24,[sp,#-16]!
	stp x25, x26,[sp,#-16]!

	mov x15,x4 // src_strd2 40
	mov x16,x5 // dst_strd 44
	mov x17,x6 // wgt0 48
	mov x19,x7 // off0 52
	mov x20,x8 // wgt1 56
	mov x21,x9 // off1 60
	mov x22,x10 // shift 64
	mov x23,x11 // lvl_shift1 68
	mov x24,x12 // lvl_shift2 72
	mov x25,x13 // ht 76
	mov x26,x14 // wd 80

	mov x6,x17 //load wgt0
	mov x11,x23 //load lvl_shift1
	mov x12,x24 //load lvl_shift2
	mov v7.h[0],w6 //moved for scalar multiplication
	mul x4, x11 , x6 //lvl_shift1 * wgt0
	mov x8,x20 //load wgt1
	mov x7,x19 //load off0
	mov v7.h[1],w8 //moved for scalar multiplication
	madd x4,x12,x8,x4 //(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
	mov x9,x21 //load off1
	add x5,x7,x9 //off0 + off1
	mov x10,x22 //load shift
	add x5,x5,#1 //off0 + off1 + 1
	sub x14,x10,#1 //shift - 1
	mov x7,x26 //load wd
	lsl x5,x5,x14 //((off0 + off1 + 1) << (shift - 1))
	dup v28.4s,w10 //vmovq_n_s32(0-shift)
	add x4,x4,x5 //tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
	dup v30.4s,w4 //vmovq_n_s32(tmp_lvl_shift)
	neg v28.4s, v28.4s
	mov x4,x15 //load src_strd2
	lsl x9,x7,#1
	mov x5,x16 //load dst_strd
	lsl x3,x3,#1
	mov x14,x25 //load ht
	lsl x4,x4,#1

	cmp x14,#0 //check ht == 0
	beq end_loops //if equal, then end the function

	outer_loop:
	cmp x7,#0 //check wd == 0
	beq end_loops //if equal, then end the function

	core_loop:
	add x6,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2src_strd1(2 because pi1_src is a 16 bit pointer)
	add x8,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2src_strd2(2 because pi2_src is a 16 bit pointer)
	ld1 {v0.4h},[x0],#8 //load and increment the pi2_src1
	add x10,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd
	ld1 {v1.4h},[x1],#8 //load and increment the pi2_src2
	smull v4.4s, v0.4h, v7.h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
	ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 ii iteration
	smull v5.4s, v1.4h, v7.h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
	ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 ii iteration
	add v4.4s, v4.4s , v5.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)

	ld1 {v0.4h},[x6],x3 //load and increment the pi2_src1 iii iteration
	smull v6.4s, v2.4h, v7.h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration

	ld1 {v1.4h},[x8],x4 //load and increment the pi2_src2 iii iteration
	add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
	smull v19.4s, v0.4h, v7.h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration

	ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 iv iteration
	smull v17.4s, v3.4h, v7.h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
	sshl v4.4s,v4.4s,v28.4s //vshlq_s32(i4_tmp1_t1, tmp_shift_t)

	ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 iv iteration
	add v6.4s, v6.4s , v17.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration

	sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1)
	smull v16.4s, v1.4h, v7.h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration

	add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
	//mov v5, v4 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
	add v19.4s, v19.4s , v16.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration

	sshl v6.4s,v6.4s,v28.4s
	//vshl.s32 q5,q5,q14 //vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
	smull v18.4s, v2.4h, v7.h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
	uqxtn v4.8b,v4.8h
	//vqmovn.u16 d4,q2 //vqmovn_u16(sto_res_tmp3)
	add v19.4s, v19.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration

	sqxtun v6.4h, v6.4s //vqmovun_s32(sto_res_tmp1) ii iteration
	smull v20.4s, v3.4h, v7.h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration

	sshl v19.4s,v19.4s,v28.4s
	//vshl.s32 q7,q7,q14 //vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
	//mov v11, v10 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration

	add v18.4s, v18.4s , v20.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
	sqxtun v19.4h, v19.4s //vqmovun_s32(sto_res_tmp1) iii iteration

	add v18.4s, v18.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
	st1 {v4.s}[0],[x2],#4 //store pu1_dst i iteration

	uqxtn v6.8b,v6.8h
	//vqmovn.u16 d10,q5 //vqmovn_u16(sto_res_tmp3) ii iteration
	sshl v18.4s,v18.4s,v28.4s
	//vshl.s32 q9,q9,q14 //vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
	st1 {v6.s}[0],[x10],x5 //store pu1_dst ii iteration


	//mov v15, v14 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
	uqxtn v19.8b,v19.8h
	//vqmovn.u16 d14,q7 //vqmovn_u16(sto_res_tmp3) iii iteration
	sqxtun v18.4h, v18.4s //vqmovun_s32(sto_res_tmp1) iv iteration
	//mov v19, v18 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
	st1 {v19.s}[0],[x10],x5 //store pu1_dst iii iteration
	uqxtn v18.8b,v18.8h
	//vqmovn.u16 d18,q9 //vqmovn_u16(sto_res_tmp3) iv iteration
	subs x7,x7,#4 //decrement wd by 4 and check for 0
	st1 {v18.s}[0],[x10],x5 //store pu1_dst iv iteration

	bgt core_loop //if greater than 0 repeat the core loop again

	end_core_loop:
	sub x20,x9,x3,lsl #2 //2*src_strd1 - wd
	neg x11, x20
	subs x14,x14,#4 //decrement the ht by 4
	sub x20,x9,x4,lsl #2 //2*src_strd2 - wd
	neg x12, x20
	add x0,x0,x11 //pi2_src1 + 4src_strd1 - 2wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
	asr x7,x9,#1
	add x1,x1,x12 //pi2_src2 + 4src_strd2 - 2wd
	sub x20,x7,x5,lsl #2 //2*dst_strd - wd
	neg x10, x20
	add x2,x2,x10 //pu1_dst + dst_std - wd
	bgt core_loop //if ht is greater than 0 goto outer_loop

	end_loops:
	// ldmfd sp!,{x4-x12,x15} //reload the registers from sp
	ldp x25, x26,[sp],#16
	ldp x23, x24,[sp],#16
	ldp x21, x22,[sp],#16
	ldp x19, x20,[sp],#16

	ret