Decoder: Moved end of pic processing to end of decode call am: 494561291a am: ba7f9e2aed am: 26bb8b98ee am: 89fa016b5f am: df2fa01c9e am: 7944003371 am: fd5c8e352a am: 9432beb534
am: 7f690f1ada

Change-Id: Ibacaae4dd46de8f63ed5a94cd850fd870c18be59
diff --git a/common/arm/ih264_inter_pred_chroma_a9q.s b/common/arm/ih264_inter_pred_chroma_a9q.s
index 6681a7c..e2b8c99 100644
--- a/common/arm/ih264_inter_pred_chroma_a9q.s
+++ b/common/arm/ih264_inter_pred_chroma_a9q.s
@@ -91,8 +91,8 @@
 @                             UWORD8 *pu1_dst,
 @                             WORD32 src_strd,
 @                             WORD32 dst_strd,
-@                             UWORD8 u1_dx,
-@                             UWORD8 u1_dy,
+@                             WORD32 u1_dx,
+@                             WORD32 u1_dy,
 @                             WORD32 ht,
 @                             WORD32 wd)
 @**************Variables Vs Registers*****************************************
diff --git a/common/arm/ih264_intra_pred_luma_16x16_a9q.s b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
index 0dd82f3..7597444 100644
--- a/common/arm/ih264_intra_pred_luma_16x16_a9q.s
+++ b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
@@ -413,7 +413,7 @@
 
     add           r7, r0, r4, lsl #3
     sub           r0, r7, r4, lsl #1
-    rsb           lr, r4, #0x0
+    neg           lr, r4
 
     vpadd.s16     d0, d0, d1
 
diff --git a/common/arm/ih264_mem_fns_neon.s b/common/arm/ih264_mem_fns_neon.s
index 39ad9b3..b9595d7 100644
--- a/common/arm/ih264_mem_fns_neon.s
+++ b/common/arm/ih264_mem_fns_neon.s
@@ -68,7 +68,7 @@
 @*
 @void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
 @                    UWORD8 *pu1_src,
-@                   UWORD8 num_bytes)
+@                   UWORD32 num_bytes)
 @**************Variables Vs Registers*************************
 @   r0 => *pu1_dst
 @   r1 => *pu1_src
@@ -97,7 +97,7 @@
 @*
 @void ih264_memcpy(UWORD8 *pu1_dst,
 @                  UWORD8 *pu1_src,
-@                  UWORD8 num_bytes)
+@                  UWORD32 num_bytes)
 @**************Variables Vs Registers*************************
 @   r0 => *pu1_dst
 @   r1 => *pu1_src
@@ -135,7 +135,7 @@
 
 @void ih264_memset_mul_8(UWORD8 *pu1_dst,
 @                       UWORD8 value,
-@                       UWORD8 num_bytes)
+@                       UWORD32 num_bytes)
 @**************Variables Vs Registers*************************
 @   r0 => *pu1_dst
 @   r1 => value
@@ -202,7 +202,7 @@
 
 @void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
 @                                   UWORD16 value,
-@                                   UWORD8 num_words)
+@                                   UWORD32 num_words)
 @**************Variables Vs Registers*************************
 @   r0 => *pu2_dst
 @   r1 => value
@@ -234,7 +234,7 @@
 
 @void ih264_memset_16bit(UWORD16 *pu2_dst,
 @                       UWORD16 value,
-@                       UWORD8 num_words)
+@                       UWORD32 num_words)
 @**************Variables Vs Registers*************************
 @   r0 => *pu2_dst
 @   r1 => value
diff --git a/common/arm/ih264_padding_neon.s b/common/arm/ih264_padding_neon.s
index e7a1f91..819b0b3 100644
--- a/common/arm/ih264_padding_neon.s
+++ b/common/arm/ih264_padding_neon.s
@@ -88,7 +88,7 @@
     stmfd         sp!, {r4-r11, lr}     @stack stores the values of the arguments
 
     sub           r5, r0, r1
-    rsb           r6, r1, #0
+    neg           r6, r1
 
 loop_neon_memcpy_mul_16:
     @ Load 16 bytes
diff --git a/common/arm/ih264_weighted_bi_pred_a9q.s b/common/arm/ih264_weighted_bi_pred_a9q.s
index 33859e6..304bd8a 100644
--- a/common/arm/ih264_weighted_bi_pred_a9q.s
+++ b/common/arm/ih264_weighted_bi_pred_a9q.s
@@ -144,7 +144,7 @@
     ldr           r4, [sp, #40]         @Load src_strd2 in r4
     ldr           r5, [sp, #44]         @Load dst_strd in r5
     sxtb          r9, r9                @sign-extend 8-bit ofst1 to 32-bit
-    rsb           r10, r6, #0           @r13 = -(log_wd + 1)
+    neg           r10, r6               @r10 = -(log_wd + 1)
     ldr           r11, [sp, #68]        @Load ht in r11
     ldr           r12, [sp, #72]        @Load wd in r12
     vdup.16       q0, r10               @Q0  = -(log_wd + 1) (32-bit)
@@ -456,7 +456,7 @@
     ldr           r9, [sp, #60]         @Load ofst1 in r9
     ldr           r10, [sp, #64]        @Load ofst2 in r10
 
-    rsb           r12, r6, #0           @r12 = -(log_wd + 1)
+    neg           r12, r6               @r12 = -(log_wd + 1)
     ldr           r4, [sp, #40]         @Load src_strd2 in r4
     ldr           r5, [sp, #44]         @Load dst_strd in r5
     vdup.16       q0, r12               @Q0  = -(log_wd + 1) (16-bit)
diff --git a/common/arm/ih264_weighted_pred_a9q.s b/common/arm/ih264_weighted_pred_a9q.s
index 81d26d4..80c2c6d 100644
--- a/common/arm/ih264_weighted_pred_a9q.s
+++ b/common/arm/ih264_weighted_pred_a9q.s
@@ -122,7 +122,7 @@
     vpush         {d8-d15}
 
     vdup.16       d2, r5                @D2 = wt (16-bit)
-    rsb           r9, r4, #0            @r9 = -log_wd
+    neg           r9, r4                @r9 = -log_wd
     vdup.8        d3, r6                @D3 = ofst (8-bit)
     cmp           r8, #16               @check if wd is 16
     vdup.16       q0, r9                @Q0 = -log_wd (16-bit)
@@ -349,7 +349,7 @@
     ldr           r6, [sp, #36]         @Load ofst = {ofst_u (8-bit), ofst_v (8-bit)}
     ldr           r8, [sp, #44]         @Load wd
 
-    rsb           r9, r4, #0            @r9 = -log_wd
+    neg           r9, r4                @r9 = -log_wd
     vdup.32       q1, r5                @Q1 = {wt_u (16-bit), wt_v (16-bit)}
     ldr           r7, [sp, #40]         @Load ht
     vpush         {d8-d15}
diff --git a/common/armv8/ih264_deblk_chroma_av8.s b/common/armv8/ih264_deblk_chroma_av8.s
index a4dbd23..b7f2d58 100644
--- a/common/armv8/ih264_deblk_chroma_av8.s
+++ b/common/armv8/ih264_deblk_chroma_av8.s
@@ -56,19 +56,19 @@
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha_cb
+//* @param[in] w2 - alpha_cb
 //*  Alpha Value for the boundary in U
 //*
-//* @param[in] x3 - beta_cb
+//* @param[in] w3 - beta_cb
 //*  Beta Value for the boundary in U
 //*
-//* @param[in] sp(0) - alpha_cr
+//* @param[in] w4 - alpha_cr
 //*    Alpha Value for the boundary in V
 //*
-//* @param[in] sp(4) - beta_cr
+//* @param[in] w5 - beta_cr
 //*    Beta Value for the boundary in V
 //*
 //* @returns
@@ -87,6 +87,7 @@
     // STMFD sp!,{x4-x6,x14}            //
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x1, w1
     mov       x6, x5
     mov       x5, x4
     sub       x0, x0, x1, lsl #1        //x0 = uc_edgePixel pointing to p1 of chroma
@@ -155,19 +156,19 @@
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha_cb
+//* @param[in] w2 - alpha_cb
 //*  Alpha Value for the boundary in U
 //*
-//* @param[in] x3 - beta_cb
+//* @param[in] w3 - beta_cb
 //*  Beta Value for the boundary in U
 //*
-//* @param[in] sp(0) - alpha_cr
+//* @param[in] w4 - alpha_cr
 //*    Alpha Value for the boundary in V
 //*
-//* @param[in] sp(4) - beta_cr
+//* @param[in] w5 - beta_cr
 //*    Beta Value for the boundary in V
 //*
 //* @returns
@@ -186,12 +187,13 @@
     // STMFD sp!,{x4,x5,x12,x14}
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x1, w1
 
     sub       x0, x0, #4                //point x0 to p1u of row0.
     mov       x12, x0                   //keep a back up of x0 for buffer write
 
-    add       x2, x2, x4, lsl #8        //x2 = (alpha_cr,alpha_cb)
-    add       x3, x3, x5, lsl #8        //x3 = (beta_cr,beta_cb)
+    add       w2, w2, w4, lsl #8        //w2 = (alpha_cr,alpha_cb)
+    add       w3, w3, w5, lsl #8        //w3 = (beta_cr,beta_cb)
 
     ld4       {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1
     ld4       {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
@@ -292,28 +294,28 @@
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha_cb
+//* @param[in] w2 - alpha_cb
 //*  Alpha Value for the boundary in U
 //*
-//* @param[in] x3 - beta_cb
+//* @param[in] w3 - beta_cb
 //*  Beta Value for the boundary in U
 //*
-//* @param[in] sp(0) - alpha_cr
+//* @param[in] w4 - alpha_cr
 //*    Alpha Value for the boundary in V
 //*
-//* @param[in] sp(4) - beta_cr
+//* @param[in] w5 - beta_cr
 //*    Beta Value for the boundary in V
 //*
-//* @param[in] sp(8) - u4_bs
+//* @param[in] w6 - u4_bs
 //*    Packed Boundary strength array
 //*
-//* @param[in] sp(12) - pu1_cliptab_cb
+//* @param[in] x7 - pu1_cliptab_cb
 //*    tc0_table for U
 //*
-//* @param[in] sp(16) - pu1_cliptab_cr
+//* @param[in] sp(0) - pu1_cliptab_cr
 //*    tc0_table for V
 //*
 //* @returns
@@ -332,14 +334,13 @@
     // STMFD sp!,{x4-x9,x14}        //
     push_v_regs
     stp       x19, x20, [sp, #-16]!
-    mov       x8, x7
-    mov       x7, x6
-    ldr       x9, [sp, #80]
+    sxtw      x1, w1
+    ldr       x8, [sp, #80]
     sub       x0, x0, x1, lsl #1        //x0 = uc_edgePixelU pointing to p1 of chroma U
-    rev       w7, w7                    //
-    mov       v12.s[0], w7              //D12[0] = ui_Bs
-    ld1       {v16.s}[0], [x8]          //D16[0] contains cliptab_cb
-    ld1       {v17.s}[0], [x9]          //D17[0] contains cliptab_cr
+    rev       w6, w6                    //reverse the byte order of the packed u4_bs
+    mov       v12.s[0], w6              //D12[0] = ui_Bs
+    ld1       {v16.s}[0], [x7]          //D16[0] contains cliptab_cb
+    ld1       {v17.s}[0], [x8]          //D17[0] contains cliptab_cr
     ld2       {v6.8b, v7.8b}, [x0], x1  //Q3=p1
     tbl       v14.8b, {v16.16b}, v12.8b //Retreiving cliptab values for U
     tbl       v28.8b, {v17.16b}, v12.8b //Retrieving cliptab values for V
@@ -428,28 +429,28 @@
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha_cb
+//* @param[in] w2 - alpha_cb
 //*  Alpha Value for the boundary in U
 //*
-//* @param[in] x3 - beta_cb
+//* @param[in] w3 - beta_cb
 //*  Beta Value for the boundary in U
 //*
-//* @param[in] sp(0) - alpha_cr
+//* @param[in] w4 - alpha_cr
 //*    Alpha Value for the boundary in V
 //*
-//* @param[in] sp(4) - beta_cr
+//* @param[in] w5 - beta_cr
 //*    Beta Value for the boundary in V
 //*
-//* @param[in] sp(8) - u4_bs
+//* @param[in] w6 - u4_bs
 //*    Packed Boundary strength array
 //*
-//* @param[in] sp(12) - pu1_cliptab_cb
+//* @param[in] x7 - pu1_cliptab_cb
 //*    tc0_table for U
 //*
-//* @param[in] sp(16) - pu1_cliptab_cr
+//* @param[in] sp(0) - pu1_cliptab_cr
 //*    tc0_table for V
 //*
 //* @returns
@@ -468,11 +469,12 @@
     // STMFD sp!,{x4-x7,x10-x12,x14}
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x1, w1
     mov       x10, x7
-    ldr       x11, [sp, #80]            //x6 = u4_bs
+    ldr       x11, [sp, #80]            //x11 = pu1_cliptab_cr
     sub       x0, x0, #4                //point x0 to p1u of row0.
-    add       x2, x2, x4, lsl #8
-    add       x3, x3, x5, lsl #8
+    add       w2, w2, w4, lsl #8
+    add       w3, w3, w5, lsl #8
     mov       x12, x0                   //keep a back up of x0 for buffer write
     ld4       {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1
     ld4       {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
diff --git a/common/armv8/ih264_deblk_luma_av8.s b/common/armv8/ih264_deblk_luma_av8.s
index 1b3950d..7705df2 100644
--- a/common/armv8/ih264_deblk_luma_av8.s
+++ b/common/armv8/ih264_deblk_luma_av8.s
@@ -60,19 +60,19 @@
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha
+//* @param[in] w2 - alpha
 //*  Alpha Value for the boundary
 //*
-//* @param[in] x3 - beta
+//* @param[in] w3 - beta
 //*  Beta Value for the boundary
 //*
-//* @param[in] sp(0) - u4_bs
+//* @param[in] w4 - u4_bs
 //*    Packed Boundary strength array
 //*
-//* @param[in] sp(4) - pu1_cliptab
+//* @param[in] x5 - pu1_cliptab
 //*    tc0_table
 //*
 //* @returns
@@ -90,6 +90,7 @@
 
     // STMFD sp!,{x4-x7,x14}
     push_v_regs
+    sxtw      x1, w1
     stp       x19, x20, [sp, #-16]!
 
     //LDRD            x4,x5,[SP,#0x14]        //x4 = ui_Bs , x5 = *puc_ClpTab
@@ -214,13 +215,13 @@
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha
+//* @param[in] w2 - alpha
 //*  Alpha Value for the boundary
 //*
-//* @param[in] x3 - beta
+//* @param[in] w3 - beta
 //*  Beta Value for the boundary
 //*
 //* @returns
@@ -240,6 +241,7 @@
     // STMFD sp!,{x12,x14}
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x1, w1
 
     // Init
     dup       v0.16b, w2                //duplicate alpha
@@ -401,19 +403,19 @@
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha
+//* @param[in] w2 - alpha
 //*  Alpha Value for the boundary
 //*
-//* @param[in] x3 - beta
+//* @param[in] w3 - beta
 //*  Beta Value for the boundary
 //*
-//* @param[in] sp(0) - u4_bs
+//* @param[in] w4 - u4_bs
 //*    Packed Boundary strength array
 //*
-//* @param[in] sp(4) - pu1_cliptab
+//* @param[in] x5 - pu1_cliptab
 //*    tc0_table
 //*
 //* @returns
@@ -432,6 +434,7 @@
     // STMFD sp!,{x12,x14}
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x1, w1
 
     sub       x0, x0, #4                //pointer uc_edgePixel-4
     mov       x12, x4
@@ -743,13 +746,13 @@
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha
+//* @param[in] w2 - alpha
 //*  Alpha Value for the boundary
 //*
-//* @param[in] x3 - beta
+//* @param[in] w3 - beta
 //*  Beta Value for the boundary
 //*
 //* @returns
diff --git a/common/armv8/ih264_default_weighted_pred_av8.s b/common/armv8/ih264_default_weighted_pred_av8.s
index 6823015..d10047e 100644
--- a/common/armv8/ih264_default_weighted_pred_av8.s
+++ b/common/armv8/ih264_default_weighted_pred_av8.s
@@ -88,18 +88,18 @@
 //                                          WORD32 src_strd1,
 //                                          WORD32 src_strd2,
 //                                          WORD32 dst_strd,
-//                                          UWORD8 ht,
-//                                          UWORD8 wd)
+//                                          WORD32 ht,
+//                                          WORD32 wd)
 //
 //**************Variables Vs Registers*****************************************
 //    x0      => puc_src1
 //    x1      => puc_src2
 //    x2      => puc_dst
-//    x3      => src_strd1
-//    [sp]    => src_strd2 (x4)
-//    [sp+4]  => dst_strd  (x5)
-//    [sp+8]  => ht        (x6)
-//    [sp+12] => wd        (x7)
+//    w3      => src_strd1
+//    w4      => src_strd2
+//    w5      => dst_strd
+//    w6      => ht
+//    w7      => wd
 //
 .text
 .p2align 2
@@ -113,6 +113,9 @@
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
     cmp       w7, #16
     beq       loop_16                   //branch if wd is 16
     cmp       w7, #8
@@ -263,18 +266,18 @@
 //                                            WORD32 src_strd1,
 //                                            WORD32 src_strd2,
 //                                            WORD32 dst_strd,
-//                                            UWORD8 ht,
-//                                            UWORD8 wd)
+//                                            WORD32 ht,
+//                                            WORD32 wd)
 //
 //**************Variables Vs Registers*****************************************
 //    x0      => puc_src1
 //    x1      => puc_src2
 //    x2      => puc_dst
-//    x3      => src_strd1
-//    [sp]    => src_strd2 (x4)
-//    [sp+4]  => dst_strd  (x5)
-//    [sp+8]  => ht        (x6)
-//    [sp+12] => wd        (x7)
+//    w3      => src_strd1
+//    w4      => src_strd2
+//    w5      => dst_strd
+//    w6      => ht
+//    w7      => wd
 //
 
 
@@ -286,6 +289,9 @@
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
     cmp       w7, #8
     beq       loop_8_uv                 //branch if wd is 8
     cmp       w7, #4
diff --git a/common/armv8/ih264_inter_pred_chroma_av8.s b/common/armv8/ih264_inter_pred_chroma_av8.s
index 714e271..f6aef40 100644
--- a/common/armv8/ih264_inter_pred_chroma_av8.s
+++ b/common/armv8/ih264_inter_pred_chroma_av8.s
@@ -91,19 +91,19 @@
 //                             UWORD8 *pu1_dst,
 //                             WORD32 src_strd,
 //                             WORD32 dst_strd,
-//                             UWORD8 u1_dx,
-//                             UWORD8 u1_dy,
+//                             WORD32 u1_dx,
+//                             WORD32 u1_dy,
 //                             WORD32 ht,
 //                             WORD32 wd)
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  u1_dx
-//   x5 =>  u1_dy
-//    x6 =>  height
-//    x7 => width
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  u1_dx
+//    w5 =>  u1_dy
+//    w6 =>  height
+//    w7 =>  width
 //
 .text
 .p2align 2
@@ -120,6 +120,12 @@
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
+    sxtw      x6, w6
+    sxtw      x7, w7
 
 
 
diff --git a/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s b/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s
index 6ad463a..e7c9f86 100644
--- a/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s
+++ b/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s
@@ -89,10 +89,10 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
 
 .text
 .p2align 2
@@ -111,6 +111,10 @@
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
     sub       x0, x0, #2                //pu1_src-2
     sub       x14, x4, #16
     movi      v0.8b, #5                 //filter coeff
diff --git a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
index 9564f99..711d73e 100644
--- a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
+++ b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
@@ -89,10 +89,10 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
 
 .text
 .p2align 2
@@ -108,6 +108,10 @@
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
 
     sub       x0, x0, x2, lsl #1        //pu1_src-2*src_strd
 
diff --git a/common/armv8/ih264_inter_pred_luma_copy_av8.s b/common/armv8/ih264_inter_pred_luma_copy_av8.s
index 1a76c1c..007df30 100644
--- a/common/armv8/ih264_inter_pred_luma_copy_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_copy_av8.s
@@ -65,10 +65,10 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x7 =>  ht
-//    x12 => wd
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
 
 .text
 .p2align 2
@@ -82,6 +82,10 @@
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
 
     mov       x12, x5
     mov       x7, x4
@@ -228,14 +232,16 @@
 // Register Usage
 // x0 : pi2_src
 // x1 : pu1_out
-// x2 : src_strd
-// x3 : out_strd
+// w2 : src_strd
+// w3 : out_strd
 // Neon registers d0-d7, d16-d30 are used
 // No need for pushing  arm and neon registers
 
     .global ih264_interleave_copy_av8
 ih264_interleave_copy_av8:
     push_v_regs
+    sxtw      x2, w2
+    sxtw      x3, w3
     ld1       {v2.8b}, [x0], x2         //load src plane 1 => d2 &pred palne 2 => d3
     ld1       {v3.8b}, [x0], x2
     mov       v2.d[1], v3.d[0]
diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
index d2897b6..dd4383e 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
@@ -52,10 +52,10 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
 
 
 .text
@@ -71,6 +71,10 @@
     //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
 
     sub       x0, x0, x2, lsl #1        //pu1_src-2*src_strd
     sub       x0, x0, #2                //pu1_src-2
diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
index 546c807..3563ac0 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
@@ -105,12 +105,12 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
-//    x7 =>  dydx
-//    x9 => *pu1_tmp
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
+//    x6 => *pu1_tmp
+//    w7 =>  dydx
 
 .text
 .p2align 2
@@ -126,6 +126,10 @@
     // store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
 
 
 
@@ -134,7 +138,8 @@
 
     mov       x9, x6
 
-    lsr       x7, x7, #3                // dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit
+                                        // by writing to w7 here, we clear the upper half of x7
+    lsr       w7, w7, #3                // dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit
 
     add       x7, x7, #2
     mov       x6, #48
diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s
index 39e3253..38268c7 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s
@@ -94,11 +94,11 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
-//   x7 =>  dydx
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
+//    w7 =>  dydx
 
 .text
 .p2align 2
@@ -114,6 +114,10 @@
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
 
 
     and       x7, x7, #3                //Finds x-offset
diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
index 3f3e297..6ccf11f 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
@@ -105,12 +105,12 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
-//    x6 =>  dydx
-//    x9 => *pu1_tmp
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
+//    x6 => *pu1_tmp
+//    w7 =>  dydx
 
 .text
 .p2align 2
@@ -125,11 +125,15 @@
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
 
     sub       x0, x0, x2, lsl #1        //pu1_src-2*src_strd
     sub       x0, x0, #2                //pu1_src-2
     mov       x9, x6
-    mov       x6, x7
+    mov       w6, w7
 
     and       x6, x6, #2                // dydx & 0x3 followed by dydx>>1 and dydx<<1
 
diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s
index ab663d0..a9dfbd1 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s
@@ -104,11 +104,11 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
-//    x6 =>  dydx
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
+//    w7 =>  dydx
 
 .text
 .p2align 2
@@ -122,7 +122,11 @@
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
-    mov       x6, x7
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
+    mov       w6, w7
     and       x7, x6, #3
     add       x7, x0, x7, lsr #1        //pu1_pred_vert = pu1_src + (x_offset>>1)
 
diff --git a/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s
index 9d19a2d..014faca 100644
--- a/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s
@@ -94,11 +94,11 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
-//   x7 =>  dydx
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
+//    w7 =>  dydx
 
 .text
 .p2align 2
@@ -112,6 +112,10 @@
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
 
 
     and       x7, x7, #12               //Finds y-offset
diff --git a/common/armv8/ih264_intra_pred_chroma_av8.s b/common/armv8/ih264_intra_pred_chroma_av8.s
index 8f0f282..39c0256 100644
--- a/common/armv8/ih264_intra_pred_chroma_av8.s
+++ b/common/armv8/ih264_intra_pred_chroma_av8.s
@@ -100,9 +100,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
 
@@ -113,13 +113,14 @@
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
-    mov       x19, #5
-    ands      x6, x4, x19
+    mov       w19, #5
+    ands      w6, w4, w19
     beq       none_available
-    cmp       x6, #1
+    cmp       w6, #1
     beq       left_only_available
-    cmp       x6, #4
+    cmp       w6, #4
     beq       top_only_available
 
 all_available:
@@ -251,9 +252,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_chroma_8x8_mode_horz_av8
@@ -263,6 +264,7 @@
 
 
     push_v_regs
+    sxtw      x3, w3
     ld1       {v0.8h}, [x0]
 
     dup       v10.8h, v0.h[7]
@@ -332,9 +334,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_chroma_8x8_mode_vert_av8
@@ -342,6 +344,7 @@
 ih264_intra_pred_chroma_8x8_mode_vert_av8:
 
     push_v_regs
+    sxtw      x3, w3
 
     add       x0, x0, #18
     ld1       {v0.8b, v1.8b}, [x0]
@@ -405,15 +408,16 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_chroma_8x8_mode_plane_av8
 ih264_intra_pred_chroma_8x8_mode_plane_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     ld1       {v0.2s}, [x0]
     add       x10, x0, #10
@@ -457,18 +461,14 @@
     rshrn     v13.4h, v26.4s, #6
     rshrn     v14.4h, v28.4s, #6
     ldrb      w6, [x0], #1
-    sxtw      x6, w6
     add       x10, x0, #31
     ldrb      w8, [x0], #1
-    sxtw      x8, w8
     ldrb      w7, [x10], #1
-    sxtw      x7, w7
     ldrb      w9, [x10], #1
-    sxtw      x9, w9
-    add       x6, x6, x7
-    add       x8, x8, x9
-    lsl       x6, x6, #4
-    lsl       x8, x8, #4
+    add       w6, w6, w7
+    add       w8, w8, w9
+    lsl       w6, w6, #4
+    lsl       w8, w8, #4
     dup       v0.8h, w6
     dup       v2.8h, w8
     dup       v4.8h, v12.h[0]
diff --git a/common/armv8/ih264_intra_pred_luma_16x16_av8.s b/common/armv8/ih264_intra_pred_luma_16x16_av8.s
index c1847b5..fa19c12 100644
--- a/common/armv8/ih264_intra_pred_luma_16x16_av8.s
+++ b/common/armv8/ih264_intra_pred_luma_16x16_av8.s
@@ -98,9 +98,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_16x16_mode_vert_av8
@@ -108,6 +108,7 @@
 ih264_intra_pred_luma_16x16_mode_vert_av8:
 
     push_v_regs
+    sxtw      x3, w3
 
 
     add       x0, x0, #17
@@ -181,9 +182,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_luma_16x16_mode_horz_av8
 
@@ -192,6 +193,7 @@
 
 
     push_v_regs
+    sxtw      x3, w3
 
     ld1       {v0.16b}, [x0]
 
@@ -283,9 +285,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_luma_16x16_mode_dc_av8
 
@@ -295,18 +297,19 @@
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     sub       v0.16b, v0.16b, v0.16b
     sub       v1.16b, v1.16b, v1.16b
     mov       w10, #0
     mov       w11 , #3
-    ands      x6, x4, #0x01
+    ands      w6, w4, #0x01
     beq       top_available             //LEFT NOT AVAILABLE
     ld1       {v0.16b}, [x0]
     add       w10, w10, #8
     add       w11, w11, #1
 top_available:
-    ands      x6, x4, #0x04
+    ands      w6, w4, #0x04
     beq       none_available
     add       x6, x0, #17
     ld1       {v1.16b}, [x6]
@@ -314,7 +317,7 @@
     add       w11, w11, #1
     b         summation
 none_available:
-    cmp       x4, #0
+    cmp       w4, #0
     bne       summation
     mov       w15, #128
     dup       v20.16b, w15
@@ -410,15 +413,16 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_luma_16x16_mode_plane_av8
 ih264_intra_pred_luma_16x16_mode_plane_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
     mov       x2, x1
     add       x1, x0, #17
     add       x0, x0, #15
@@ -440,76 +444,58 @@
     uxtl      v18.8h, v7.8b
     add       x7, x0, x4, lsl #3
     sub       x0, x7, x4, lsl #1
-    sub       x20, x4, #0x0
-    neg       x14, x20
+    neg       x14, x4
     addp      v0.8h, v0.8h, v1.8h
     ldrb      w8, [x7], #-1
-    sxtw      x8, w8
     ldrb      w9, [x0], #1
-    sxtw      x9, w9
     saddlp    v0.2s, v0.4h
-    sub       x12, x8, x9
+    sub       w12, w8, w9
     ldrb      w8, [x7], #-1
-    sxtw      x8, w8
     saddlp    v0.1d, v0.2s
     ldrb      w9, [x0], #1
-    sxtw      x9, w9
-    sub       x8, x8, x9
+    sub       w8, w8, w9
     shl       v2.2s, v0.2s, #2
-    add       x12, x12, x8, lsl #1
+    add       w12, w12, w8, lsl #1
     add       v0.2s, v0.2s , v2.2s
     ldrb      w8, [x7], #-1
-    sxtw      x8, w8
     ldrb      w9, [x0], #1
-    sxtw      x9, w9
     srshr     v0.2s, v0.2s, #6          // i_b = D0[0]
-    sub       x8, x8, x9
+    sub       w8, w8, w9
     ldrb      w5, [x7], #-1
-    sxtw      x5, w5
-    add       x8, x8, x8, lsl #1
+    add       w8, w8, w8, lsl #1
     dup       v4.8h, v0.h[0]
-    add       x12, x12, x8
+    add       w12, w12, w8
     ldrb      w9, [x0], #1
-    sxtw      x9, w9
     mul       v0.8h, v4.8h , v16.8h
-    sub       x5, x5, x9
+    sub       w5, w5, w9
     mul       v2.8h, v4.8h , v18.8h
-    add       x12, x12, x5, lsl #2
+    add       w12, w12, w5, lsl #2
     ldrb      w8, [x7], #-1
-    sxtw      x8, w8
     ldrb      w9, [x0], #1
-    sxtw      x9, w9
-    sub       x8, x8, x9
+    sub       w8, w8, w9
     ldrb      w5, [x7], #-1
-    sxtw      x5, w5
-    add       x8, x8, x8, lsl #2
+    add       w8, w8, w8, lsl #2
     ldrb      w6, [x0], #1
-    sxtw      x6, w6
-    add       x12, x12, x8
+    add       w12, w12, w8
     ldrb      w8, [x7], #-1
-    sxtw      x8, w8
     ldrb      w9, [x0], #1
-    sxtw      x9, w9
-    sub       x5, x5, x6
-    sub       x8, x8, x9
-    add       x5, x5, x5, lsl #1
-    sub       x20, x8, x8, lsl #3
-    neg       x8, x20
-    add       x12, x12, x5, lsl #1
+    sub       w5, w5, w6
+    sub       w8, w8, w9
+    add       w5, w5, w5, lsl #1
+    sub       w20, w8, w8, lsl #3
+    neg       w8, w20
+    add       w12, w12, w5, lsl #1
     ldrb      w5, [x7], #-1
-    sxtw      x5, w5
     ldrb      w6, [x10]                 //top_left
-    sxtw      x6, w6
-    add       x12, x12, x8
-    sub       x9, x5, x6
+    add       w12, w12, w8
+    sub       w9, w5, w6
     ldrb      w6, [x1, #7]
-    sxtw      x6, w6
-    add       x12, x12, x9, lsl #3      // i_c = x12
-    add       x8, x5, x6
-    add       x12, x12, x12, lsl #2
-    lsl       x8, x8, #4                // i_a = x8
-    add       x12, x12, #0x20
-    lsr       x12, x12, #6
+    add       w12, w12, w9, lsl #3      // i_c = w12
+    add       w8, w5, w6
+    add       w12, w12, w12, lsl #2
+    lsl       w8, w8, #4                // i_a = w8
+    add       w12, w12, #0x20
+    lsr       w12, w12, #6
     shl       v28.8h, v4.8h, #3
     dup       v6.8h, w12
     dup       v30.8h, w8
diff --git a/common/armv8/ih264_intra_pred_luma_4x4_av8.s b/common/armv8/ih264_intra_pred_luma_4x4_av8.s
index 62e8cee..1f95131 100644
--- a/common/armv8/ih264_intra_pred_luma_4x4_av8.s
+++ b/common/armv8/ih264_intra_pred_luma_4x4_av8.s
@@ -102,15 +102,16 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_luma_4x4_mode_vert_av8
 
 ih264_intra_pred_luma_4x4_mode_vert_av8:
 
     push_v_regs
+    sxtw      x3, w3
 
     add       x0, x0, #5
 
@@ -171,9 +172,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
 
@@ -182,6 +183,7 @@
 ih264_intra_pred_luma_4x4_mode_horz_av8:
 
     push_v_regs
+    sxtw      x3, w3
 
     ld1       {v1.s}[0], [x0]
     dup       v0.8b, v1.b[3]
@@ -246,9 +248,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
 
@@ -261,41 +263,34 @@
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
-    ands      x5, x4, #0x01
+    ands      w5, w4, #0x01
     beq       top_available             //LEFT NOT AVAILABLE
 
     add       x10, x0, #3
     mov       x2, #-1
     ldrb      w5, [x10], #-1
-    sxtw      x5, w5
     ldrb      w6, [x10], #-1
-    sxtw      x6, w6
     ldrb      w7, [x10], #-1
-    sxtw      x7, w7
-    add       x5, x5, x6
+    add       w5, w5, w6
     ldrb      w8, [x10], #-1
-    sxtw      x8, w8
-    add       x5, x5, x7
-    ands      x11, x4, #0x04            // CHECKING IF TOP_AVAILABLE  ELSE BRANCHING TO ONLY LEFT AVAILABLE
-    add       x5, x5, x8
+    add       w5, w5, w7
+    ands      w11, w4, #0x04            // CHECKING IF TOP_AVAILABLE  ELSE BRANCHING TO ONLY LEFT AVAILABLE
+    add       w5, w5, w8
     beq       left_available
     add       x10, x0, #5
     //    BOTH LEFT AND TOP AVAILABLE
     ldrb      w6, [x10], #1
-    sxtw      x6, w6
     ldrb      w7, [x10], #1
-    sxtw      x7, w7
-    add       x5, x5, x6
+    add       w5, w5, w6
     ldrb      w8, [x10], #1
-    sxtw      x8, w8
-    add       x5, x5, x7
+    add       w5, w5, w7
     ldrb      w9, [x10], #1
-    sxtw      x9, w9
-    add       x5, x5, x8
-    add       x5, x5, x9
-    add       x5, x5, #4
-    lsr       x5, x5, #3
+    add       w5, w5, w8
+    add       w5, w5, w9
+    add       w5, w5, #4
+    lsr       w5, w5, #3
     dup       v0.8b, w5
     st1       {v0.s}[0], [x1], x3
     st1       {v0.s}[0], [x1], x3
@@ -304,23 +299,19 @@
     b         end_func
 
 top_available: // ONLT TOP AVAILABLE
-    ands      x11, x4, #0x04            // CHECKING TOP AVAILABILTY  OR ELSE BRANCH TO NONE AVAILABLE
+    ands      w11, w4, #0x04            // CHECKING TOP AVAILABILITY  OR ELSE BRANCH TO NONE AVAILABLE
     beq       none_available
 
     add       x10, x0, #5
     ldrb      w6, [x10], #1
-    sxtw      x6, w6
     ldrb      w7, [x10], #1
-    sxtw      x7, w7
     ldrb      w8, [x10], #1
-    sxtw      x8, w8
-    add       x5, x6, x7
+    add       w5, w6, w7
     ldrb      w9, [x10], #1
-    sxtw      x9, w9
-    add       x5, x5, x8
-    add       x5, x5, x9
-    add       x5, x5, #2
-    lsr       x5, x5, #2
+    add       w5, w5, w8
+    add       w5, w5, w9
+    add       w5, w5, #2
+    lsr       w5, w5, #2
     dup       v0.8b, w5
     st1       {v0.s}[0], [x1], x3
     st1       {v0.s}[0], [x1], x3
@@ -401,9 +392,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_4x4_mode_diag_dl_av8
@@ -413,6 +404,7 @@
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     add       x0, x0, #5
     sub       x5, x3, #2
@@ -488,9 +480,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_4x4_mode_diag_dr_av8
@@ -499,6 +491,7 @@
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
 
     ld1       {v0.8b}, [x0]
@@ -571,9 +564,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_4x4_mode_vert_r_av8
@@ -582,6 +575,7 @@
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
 
     ld1       {v0.8b}, [x0]
@@ -656,9 +650,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_4x4_mode_horz_d_av8
@@ -667,6 +661,7 @@
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     ld1       {v0.8b}, [x0]
     add       x0, x0, #1
@@ -743,9 +738,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_4x4_mode_vert_l_av8
@@ -754,6 +749,7 @@
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
     add       x0, x0, #4
     ld1       {v0.8b}, [x0]
     add       x0, x0, #1
@@ -825,9 +821,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_4x4_mode_horz_u_av8
@@ -835,11 +831,11 @@
 ih264_intra_pred_luma_4x4_mode_horz_u_av8:
 
     push_v_regs
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
     mov       x10, x0
     ld1       {v0.8b}, [x0]
     ldrb      w9, [x0], #1
-    sxtw      x9, w9
     ext       v1.8b, v0.8b , v0.8b , #1
     ld1       {v0.b}[7], [x10]
     ext       v2.8b, v1.8b , v1.8b , #1
diff --git a/common/armv8/ih264_intra_pred_luma_8x8_av8.s b/common/armv8/ih264_intra_pred_luma_8x8_av8.s
index bf9a4c1..273aa81 100644
--- a/common/armv8/ih264_intra_pred_luma_8x8_av8.s
+++ b/common/armv8/ih264_intra_pred_luma_8x8_av8.s
@@ -102,9 +102,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_8x8_mode_vert_av8
@@ -114,6 +114,7 @@
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     //stp x19, x20,[sp,#-16]!
+    sxtw      x3, w3
 
     add       x0, x0, #9
     ld1       {v0.8b}, [x0]
@@ -180,9 +181,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_8x8_mode_horz_av8
@@ -194,38 +195,30 @@
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
     add       x0, x0, #7
-    mov       x2 , #-1
 
     ldrb      w5, [x0], #-1
-    sxtw      x5, w5
     ldrb      w6, [x0], #-1
-    sxtw      x6, w6
     dup       v0.8b, w5
     st1       {v0.8b}, [x1], x3
     ldrb      w7, [x0], #-1
-    sxtw      x7, w7
     dup       v1.8b, w6
     st1       {v1.8b}, [x1], x3
     dup       v2.8b, w7
     ldrb      w8, [x0], #-1
-    sxtw      x8, w8
     dup       v3.8b, w8
     st1       {v2.8b}, [x1], x3
     ldrb      w5, [x0], #-1
-    sxtw      x5, w5
     st1       {v3.8b}, [x1], x3
     dup       v0.8b, w5
     ldrb      w6, [x0], #-1
-    sxtw      x6, w6
     st1       {v0.8b}, [x1], x3
     ldrb      w7, [x0], #-1
-    sxtw      x7, w7
     dup       v1.8b, w6
     dup       v2.8b, w7
     st1       {v1.8b}, [x1], x3
     ldrb      w8, [x0], #-1
-    sxtw      x8, w8
     dup       v3.8b, w8
     st1       {v2.8b}, [x1], x3
     st1       {v3.8b}, [x1], x3
@@ -285,9 +278,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_8x8_mode_dc_av8
@@ -298,37 +291,30 @@
 
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
-    ands      x6, x4, #0x01
+    ands      w6, w4, #0x01
     beq       top_available             //LEFT NOT AVAILABLE
 
     add       x10, x0, #7
     mov       x2, #-1
     ldrb      w5, [x10], -1
-    sxtw      x5, w5
     ldrb      w6, [x10], -1
-    sxtw      x6, w6
     ldrb      w7, [x10], -1
-    sxtw      x7, w7
-    add       x5, x5, x6
+    add       w5, w5, w6
     ldrb      w8, [x10], -1
-    sxtw      x8, w8
-    add       x5, x5, x7
+    add       w5, w5, w7
     ldrb      w6, [x10], -1
-    sxtw      x6, w6
-    add       x5, x5, x8
+    add       w5, w5, w8
     ldrb      w7, [x10], -1
-    sxtw      x7, w7
-    add       x5, x5, x6
+    add       w5, w5, w6
     ldrb      w8, [x10], -1
-    sxtw      x8, w8
-    add       x5, x5, x7
-    ands      x11, x4, #0x04            // CHECKING IF TOP_AVAILABLE  ELSE BRANCHING TO ONLY LEFT AVAILABLE
-    add       x5, x5, x8
+    add       w5, w5, w7
+    ands      w11, w4, #0x04            // CHECKING IF TOP_AVAILABLE  ELSE BRANCHING TO ONLY LEFT AVAILABLE
+    add       w5, w5, w8
     ldrb      w6, [x10], -1
-    sxtw      x6, w6
-    add       x5, x5, x6
+    add       w5, w5, w6
     beq       left_available
     add       x10, x0, #9
     //    BOTH LEFT AND TOP AVAILABLE
@@ -351,7 +337,7 @@
     b         end_func
 
 top_available: // ONLT TOP AVAILABLE
-    ands      x11, x4, #0x04            // CHECKING TOP AVAILABILTY  OR ELSE BRANCH TO NONE AVAILABLE
+    ands      w11, w4, #0x04            // CHECKING TOP AVAILABILITY  OR ELSE BRANCH TO NONE AVAILABLE
     beq       none_available
 
     add       x10, x0, #9
@@ -452,9 +438,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_luma_8x8_mode_diag_dl_av8
 
@@ -463,6 +449,7 @@
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     add       x0, x0, #9
     sub       x5, x3, #4
@@ -554,9 +541,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_8x8_mode_diag_dr_av8
@@ -566,6 +553,7 @@
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
 
     ld1       { v0.16b}, [x0]
@@ -654,9 +642,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_8x8_mode_vert_r_av8
@@ -666,6 +654,7 @@
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     ld1       { v0.16b}, [x0]
     mov       v1.d[0], v0.d[1]
@@ -780,9 +769,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_luma_8x8_mode_horz_d_av8
 
@@ -791,6 +780,7 @@
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     ld1       { v0.16b}, [x0]
     mov       v1.d[0], v0.d[1]
@@ -910,9 +900,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_8x8_mode_vert_l_av8
@@ -922,6 +912,7 @@
     // STMFD sp!, {x4-x12, x14}         //Restoring registers from stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
     add       x0, x0, #9
     ld1       { v0.16b}, [x0]
     mov       v1.d[0], v0.d[1]
@@ -1018,9 +1009,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_luma_8x8_mode_horz_u_av8
 
@@ -1029,6 +1020,7 @@
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     ld1       {v0.8b}, [x0]
     ld1       {v1.b}[7], [x0]
diff --git a/common/armv8/ih264_iquant_itrans_recon_av8.s b/common/armv8/ih264_iquant_itrans_recon_av8.s
index 4c83036..003ee74 100644
--- a/common/armv8/ih264_iquant_itrans_recon_av8.s
+++ b/common/armv8/ih264_iquant_itrans_recon_av8.s
@@ -103,11 +103,11 @@
 //x0 => *pi2_src
 //x1 => *pu1_pred
 //x2 => *pu1_out
-//x3 =>  pred_strd
-//x4 =>  out_strd
+//w3 =>  pred_strd
+//w4 =>  out_strd
 //x5 => *pu2_iscal_mat
 //x6 => *pu2_weigh_mat
-//x7 =>  u4_qp_div_6
+//w7 =>  u4_qp_div_6
 //   =>  pi4_tmp
 //   =>  iq_start_idx
 //   =>  pi2_dc_ld_addr
@@ -119,6 +119,8 @@
 ih264_iquant_itrans_recon_4x4_av8:
 
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
 
     dup       v30.4s, w7                //Populate the u4_qp_div_6 in Q15
 
@@ -292,11 +294,11 @@
 //x0 => *pi2_src
 //x1 => *pu1_pred
 //x2 => *pu1_out
-//x3 =>  pred_strd
-//x4 =>  out_strd
+//w3 =>  pred_strd
+//w4 =>  out_strd
 //x5 => *pu2_iscal_mat
 //x6 => *pu2_weigh_mat
-//x7 =>  u4_qp_div_6
+//w7 =>  u4_qp_div_6
 //sp =>  pi4_tmp
 //sp#8 => *pi2_dc_src
 
@@ -315,6 +317,8 @@
 
     //reduce sp by 64
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
 
     dup       v30.4s, w7                //Populate the u4_qp_div_6 in Q15
 
@@ -512,11 +516,11 @@
 //x0       => *pi2_src
 //x1       => *pu1_pred
 //x2       => *pu1_out
-//x3       =>  pred_strd
-//x4       =>  out_strd
+//w3       =>  pred_strd
+//w4       =>  out_strd
 //x5       =>  *pu2_iscal_mat
 //x6       =>  *pu2_weigh_mat
-//x7       =>  u4_qp_div_6
+//w7       =>  u4_qp_div_6
 //NOT USED =>  pi4_tmp
 //NOT USED =>  iq_start_idx
 //NOT USED =>  pi2_dc_ld_addr
@@ -525,6 +529,8 @@
 ih264_iquant_itrans_recon_8x8_av8:
 
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
 
     ld1       {v8.8h -v11.8h}, [x5], #64
     ld1       {v12.8h-v15.8h}, [x5]
diff --git a/common/armv8/ih264_iquant_itrans_recon_dc_av8.s b/common/armv8/ih264_iquant_itrans_recon_dc_av8.s
index 8bb9c32..13061ec 100644
--- a/common/armv8/ih264_iquant_itrans_recon_dc_av8.s
+++ b/common/armv8/ih264_iquant_itrans_recon_dc_av8.s
@@ -104,11 +104,11 @@
 //x0 => *pi2_src
 //x1 => *pu1_pred
 //x2 => *pu1_out
-//x3 =>  pred_strd
-//x4 =>  out_strd
+//w3 =>  pred_strd
+//w4 =>  out_strd
 //x5 => *pu2_iscal_mat
 //x6 => *pu2_weigh_mat
-//x7 =>  u4_qp_div_6
+//w7 =>  u4_qp_div_6
 //   =>  pi4_tmp
 //   =>  iq_start_idx
 //   =>  pi2_dc_ld_addr
@@ -119,6 +119,8 @@
     .global ih264_iquant_itrans_recon_4x4_dc_av8
 ih264_iquant_itrans_recon_4x4_dc_av8:
 
+    sxtw      x3, w3
+    sxtw      x4, w4
     ldr       w8, [sp, #8]              //Loads iq_start_idx
     subs      w8, w8, #1                // if x8 == 1 => intra case , so result of subtraction is zero and z flag is set
 
@@ -209,11 +211,11 @@
 // x0 : pi2_src
 // x1 : pu1_pred
 // x2 : pu1_out
-// x3 : pred_strd
-// x4 : out_strd
+// w3 : pred_strd
+// w4 : out_strd
 // x5 : pu2_iscal_mat
 // x6 : pu2_weigh_mat
-// x7 : u4_qp_div_6
+// w7 : u4_qp_div_6
 //    : pi2_tmp
 //    : pi2_dc_src
 // Neon registers d0-d7, d16-d30 are used
@@ -223,6 +225,8 @@
     .global ih264_iquant_itrans_recon_chroma_4x4_dc_av8
 ih264_iquant_itrans_recon_chroma_4x4_dc_av8:
 
+    sxtw      x3, w3
+    sxtw      x4, w4
     ldr       x0, [sp, #8]
     push_v_regs
     ld1       {v0.h}[0], [x0]
@@ -327,11 +331,11 @@
 //x0       => *pi2_src
 //x1       => *pu1_pred
 //x2       => *pu1_out
-//x3       =>  pred_strd
-//x4       =>  out_strd
+//w3       =>  pred_strd
+//w4       =>  out_strd
 //x5       =>  *pu2_iscal_mat
 //x6       =>  *pu2_weigh_mat
-//x7       =>  u4_qp_div_6
+//w7       =>  u4_qp_div_6
 //NOT USED =>  pi4_tmp
 //NOT USED =>  iq_start_idx
 //NOT USED =>  pi2_dc_ld_addr
@@ -340,6 +344,8 @@
 ih264_iquant_itrans_recon_8x8_dc_av8:
 
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
 
     ld1       {v1.h}[0], [x5]
     ld1       {v2.h}[0], [x6]
diff --git a/common/armv8/ih264_mem_fns_neon_av8.s b/common/armv8/ih264_mem_fns_neon_av8.s
index 4e9020d..802550d 100644
--- a/common/armv8/ih264_mem_fns_neon_av8.s
+++ b/common/armv8/ih264_mem_fns_neon_av8.s
@@ -70,11 +70,11 @@
 //*/
 //void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
 //                      UWORD8 *pu1_src,
-//                      UWORD8 num_bytes)
+//                      UWORD32 num_bytes)
 //**************Variables Vs Registers*************************
 //    x0 => *pu1_dst
 //    x1 => *pu1_src
-//    x2 => num_bytes
+//    w2 => num_bytes
 
 
 
@@ -89,7 +89,7 @@
     ld1       {v0.8b}, [x1], #8
     st1       {v0.8b}, [x0], #8
 
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     bne       loop_neon_memcpy_mul_8
     ret
 
@@ -99,38 +99,36 @@
 //*/
 //void ih264_memcpy(UWORD8 *pu1_dst,
 //                  UWORD8 *pu1_src,
-//                  UWORD8 num_bytes)
+//                  UWORD32 num_bytes)
 //**************Variables Vs Registers*************************
 //    x0 => *pu1_dst
 //    x1 => *pu1_src
-//    x2 => num_bytes
+//    w2 => num_bytes
 
 
 
     .global ih264_memcpy_av8
 
 ih264_memcpy_av8:
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     blt       arm_memcpy
 loop_neon_memcpy:
     // Memcpy 8 bytes
     ld1       {v0.8b}, [x1], #8
     st1       {v0.8b}, [x0], #8
 
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     bge       loop_neon_memcpy
-    cmn       x2, #8
+    cmn       w2, #8
     beq       end_func1
 
 arm_memcpy:
-    add       x2, x2, #8
+    add       w2, w2, #8
 
 loop_arm_memcpy:
     ldrb      w3, [x1], #1
-    sxtw      x3, w3
     strb      w3, [x0], #1
-    sxtw      x3, w3
-    subs      x2, x2, #1
+    subs      w2, w2, #1
     bne       loop_arm_memcpy
     ret
 end_func1:
@@ -139,7 +137,7 @@
 
 //void ih264_memset_mul_8(UWORD8 *pu1_dst,
 //                       UWORD8 value,
-//                       UWORD8 num_bytes)
+//                       UWORD32 num_bytes)
 //**************Variables Vs Registers*************************
 //    x0 => *pu1_dst
 //    x1 => value
@@ -156,7 +154,7 @@
     // Memset 8 bytes
     st1       {v0.8b}, [x0], #8
 
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     bne       loop_memset_mul_8
 
     ret
@@ -164,36 +162,35 @@
 
 //void ih264_memset(UWORD8 *pu1_dst,
 //                       UWORD8 value,
-//                       UWORD8 num_bytes)
+//                       UWORD32 num_bytes)
 //**************Variables Vs Registers*************************
 //    x0 => *pu1_dst
-//    x1 => value
-//    x2 => num_bytes
+//    w1 => value
+//    w2 => num_bytes
 
 
 
     .global ih264_memset_av8
 
 ih264_memset_av8:
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     blt       arm_memset
     dup       v0.8b, w1
 loop_neon_memset:
     // Memcpy 8 bytes
     st1       {v0.8b}, [x0], #8
 
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     bge       loop_neon_memset
-    cmn       x2, #8
+    cmn       w2, #8
     beq       end_func2
 
 arm_memset:
-    add       x2, x2, #8
+    add       w2, w2, #8
 
 loop_arm_memset:
     strb      w1, [x0], #1
-    sxtw      x1, w1
-    subs      x2, x2, #1
+    subs      w2, w2, #1
     bne       loop_arm_memset
     ret
 end_func2:
@@ -205,11 +202,11 @@
 
 //void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
 //                                      UWORD16 value,
-//                                      UWORD8 num_words)
+//                                      UWORD32 num_words)
 //**************Variables Vs Registers*************************
 //    x0 => *pu2_dst
-//    x1 => value
-//    x2 => num_words
+//    w1 => value
+//    w2 => num_words
 
 
     .global ih264_memset_16bit_mul_8_av8
@@ -224,7 +221,7 @@
     st1       {v0.4h}, [x0], #8
     st1       {v0.4h}, [x0], #8
 
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     bne       loop_memset_16bit_mul_8
 
     ret
@@ -233,18 +230,18 @@
 
 //void ih264_memset_16bit(UWORD16 *pu2_dst,
 //                       UWORD16 value,
-//                       UWORD8 num_words)
+//                       UWORD32 num_words)
 //**************Variables Vs Registers*************************
 //    x0 => *pu2_dst
-//    x1 => value
-//    x2 => num_words
+//    w1 => value
+//    w2 => num_words
 
 
 
     .global ih264_memset_16bit_av8
 
 ih264_memset_16bit_av8:
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     blt       arm_memset_16bit
     dup       v0.4h, w1
 loop_neon_memset_16bit:
@@ -252,18 +249,17 @@
     st1       {v0.4h}, [x0], #8
     st1       {v0.4h}, [x0], #8
 
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     bge       loop_neon_memset_16bit
-    cmn       x2, #8
+    cmn       w2, #8
     beq       end_func3
 
 arm_memset_16bit:
-    add       x2, x2, #8
+    add       w2, w2, #8
 
 loop_arm_memset_16bit:
     strh      w1, [x0], #2
-    sxtw      x1, w1
-    subs      x2, x2, #1
+    subs      w2, w2, #1
     bne       loop_arm_memset_16bit
     ret
 
diff --git a/common/armv8/ih264_padding_neon_av8.s b/common/armv8/ih264_padding_neon_av8.s
index 35d9c8a..e03fe2f 100644
--- a/common/armv8/ih264_padding_neon_av8.s
+++ b/common/armv8/ih264_padding_neon_av8.s
@@ -76,9 +76,9 @@
 //                   WORD32 pad_size)
 //**************Variables Vs Registers*************************
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => wd
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => wd
+//    w3 => pad_size
 
     .global ih264_pad_top_av8
 
@@ -86,25 +86,25 @@
 
     // STMFD sp!, {x4-x11,x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
     stp       x19, x20, [sp, #-16]!
 
     sub       x5, x0, x1
-    sub       x20, x1, #0
-    neg       x6, x20
+    neg       x6, x1
 
 loop_neon_memcpy_mul_16:
     // Load 16 bytes
     ld1       {v0.8b, v1.8b}, [x0], #16
     mov       x4, x5
-    mov       x7, x3
+    mov       w7, w3
     add       x5, x5, #16
 
 loop_neon_pad_top:
     st1       {v0.8b, v1.8b}, [x4], x6
-    subs      x7, x7, #1
+    subs      w7, w7, #1
     bne       loop_neon_pad_top
 
-    subs      x2, x2, #16
+    subs      w2, w2, #16
     bne       loop_neon_memcpy_mul_16
 
     // LDMFD sp!,{x4-x11,pc}                //Reload the registers from SP
@@ -160,9 +160,9 @@
 //                        WORD32 pad_size)
 //**************Variables Vs Registers*************************
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => ht
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => ht
+//    w3 => pad_size
 
 
 
@@ -172,6 +172,8 @@
 
     // STMFD sp!, {x4-x11,x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
 
@@ -182,43 +184,35 @@
 loop_16:                                //  /*hard coded for width=16  ,height =8,16*/
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], x1        // 16 bytes store
     dup       v2.16b, w9
     st1       {v2.16b}, [x4], x1        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     dup       v4.16b, w10
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x1        // 16 bytes store
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     st1       {v6.16b}, [x4], x1        // 16 bytes store
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], x1        // 16 bytes store
     dup       v2.16b, w9
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], x1        // 16 bytes store
     dup       v4.16b, w10
     dup       v6.16b, w11
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     st1       {v4.16b}, [x4], x1        // 16 bytes store
     st1       {v6.16b}, [x4], x1        // 16 bytes store
     bne       loop_16
@@ -227,14 +221,11 @@
 loop_32:                                //  /*hard coded for width=32 ,height =8,16*/
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.16b, w9
     st1       {v0.16b}, [x4], x6
@@ -243,35 +234,30 @@
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     dup       v0.16b, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     st1       {v6.16b}, [x4], x6        // 16 bytes store
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.16b, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.16b, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
     bne       loop_32
@@ -333,9 +319,9 @@
 //                            WORD32 pad_size)
 //{
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => ht
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => ht
+//    w3 => pad_size
 
 
 
@@ -345,6 +331,8 @@
 
     // STMFD sp!, {x4-x11, x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
     sub       x4, x0, x3
@@ -354,27 +342,23 @@
 loop_32_l_c:                            //  /*hard coded for width=32  ,height =4,8,12*/
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.8h, w8
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     dup       v6.8h, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #4
+    subs      w2, w2, #4
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
 
@@ -383,27 +367,23 @@
 
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.8h, w8
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     dup       v6.8h, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #4
+    subs      w2, w2, #4
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
 
@@ -412,20 +392,16 @@
 
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.8h, w8
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
@@ -500,9 +476,9 @@
 //}
 //
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => ht
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => ht
+//    w3 => pad_size
 
 
 
@@ -512,6 +488,8 @@
 
     // STMFD sp!, {x4-x11, x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
     mov       x4, x0
@@ -522,43 +500,35 @@
 loop_16_r: //  /*hard coded for width=16  ,height =8,16*/
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], x1        // 16 bytes store
     dup       v2.16b, w9
     st1       {v2.16b}, [x4], x1        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     dup       v4.16b, w10
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x1        // 16 bytes store
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     st1       {v6.16b}, [x4], x1        // 16 bytes store
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], x1        // 16 bytes store
     dup       v2.16b, w9
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], x1        // 16 bytes store
     dup       v4.16b, w10
     dup       v6.16b, w11
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     st1       {v4.16b}, [x4], x1        // 16 bytes store
     st1       {v6.16b}, [x4], x1        // 16 bytes store
     bne       loop_16_r
@@ -567,14 +537,11 @@
 loop_32_r:                              //  /*hard coded for width=32  ,height =8,16*/
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.16b, w9
     st1       {v0.16b}, [x4], x6
@@ -583,35 +550,30 @@
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     st1       {v6.16b}, [x4], x6        // 16 bytes store
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.16b, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.16b, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
     bne       loop_32_r
@@ -672,9 +634,9 @@
 //                        WORD32 ht,
 //                        WORD32 pad_size)
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => ht
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => ht
+//    w3 => pad_size
 
 
 
@@ -684,6 +646,8 @@
 
     // STMFD sp!, {x4-x11, x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
     mov       x4, x0
@@ -692,24 +656,20 @@
 loop_32_r_c: //  /*hard coded for width=32 ,height =8,4*/
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.8h, w8
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #4
+    subs      w2, w2, #4
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.8h, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
@@ -720,27 +680,23 @@
 
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     dup       v0.8h, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.8h, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #4
+    subs      w2, w2, #4
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
 
@@ -748,20 +704,16 @@
     bne       loop_32_r_c
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     dup       v0.8h, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
diff --git a/common/armv8/ih264_resi_trans_quant_av8.s b/common/armv8/ih264_resi_trans_quant_av8.s
index 316c220..d2ba3cf 100644
--- a/common/armv8/ih264_resi_trans_quant_av8.s
+++ b/common/armv8/ih264_resi_trans_quant_av8.s
@@ -45,18 +45,6 @@
 //* function name     : ih264_resi_trans_quant_4x4
 //* description       : this function does cf4 of h264
 //*
-//* arguments         :   x0 :pointer to src buffer
-//                        x1 :pointer to pred buffer
-//                        x2 :pointer to dst buffer
-//                        x3 :source stride
-//                        x4 :pred stride,
-//                        x5 :dst stride,
-//                        x6 :pointer to scaling matrix,
-//                        x7 :pointer to threshold matrix,
-//                        stack   qbits,
-//                                rounding factor,
-//                                pointer to store nnz
-//                                pointer to store non quantized dc value
 // values returned   : none
 //
 // register usage    :
@@ -77,34 +65,24 @@
     .global ih264_resi_trans_quant_4x4_av8
 ih264_resi_trans_quant_4x4_av8:
 
-    //x0     :pointer to src buffer
-    //x1     :pointer to pred buffer
-    //x2     :pointer to dst buffer
-    //x3     :source stride
-    //x4     :pred stride
-    //x5     :dst stride,
-    //x6     :scale matirx,
-    //x7     :threshold matrix
-    //       :qbits
-    //       :round factor
-    //       :nnz
-    //       :pointer to store non quantized dc value
     push_v_regs
     //x0     :pointer to src buffer
     //x1     :pointer to pred buffer
     //x2     :pointer to dst buffer
-    //x3     :source stride
-    //x4     :pred stride
-    //x5     :scale matirx,
+    //w3     :source stride
+    //w4     :pred stride
+    //x5     :scale matrix,
     //x6     :threshold matrix
-    //x7     :qbits
-    //x8        :round factor
+    //w7     :qbits
+    //w8        :round factor
     //x9        :nnz
     //x10       :pointer to store non quantized dc value
 
+    sxtw      x3, w3
+    sxtw      x4, w4
     ldr       w8, [sp, #64]             //load round factor
     ldr       x10, [sp, #80]            //load addres for non quant val
-    neg       x7, x7                    //negate the qbit value for usiing lsl
+    neg       w7, w7                    //negate the qbit value for usiing lsl
     ldr       x9, [sp, #72]
 
     //------------fucntion loading done----------------;
@@ -259,18 +237,6 @@
 //* description       : this function does residue calculation, forward transform
 //*                        and quantization for 4x4 chroma block.
 //*
-//* arguments         :   x0 :pointer to src buffer
-//                        x1 :pointer to pred buffer
-//                        x2 :pointer to dst buffer
-//                        x3 :source stride
-//                        x4 :pred stride,
-//                        x5 :dst stride,
-//                        x6 :pointer to scaling matrix,
-//                        x7 :pointer to threshold matrix,
-//                        stack     qbits,
-//                                  rounding factor,
-//                                  pointer to store nnz
-//                                  pointer to store unquantized dc values
 // values returned   : none
 //
 // register usage    :
@@ -290,33 +256,24 @@
     .global ih264_resi_trans_quant_chroma_4x4_av8
 ih264_resi_trans_quant_chroma_4x4_av8:
 
-    //x0     :pointer to src buffer
-    //x1     :pointer to pred buffer
-    //x2     :pointer to dst buffer
-    //x3     :source stride
-    //stack     :pred stride
-    //          :scale matirx,
-    //          :threshold matrix
-    //          :qbits
-    //          :round factor
-    //          :nnz
-    //          :pu1_dc_alt_addr
     push_v_regs
     //x0     :pointer to src buffer
     //x1     :pointer to pred buffer
     //x2     :pointer to dst buffer
-    //x3     :source stride
-    //x4     :pred stride
+    //w3     :source stride
+    //w4     :pred stride
     //x5     :scale matirx,
     //x6     :threshold matrix
-    //x7     :qbits
-    //x8        :round factor
+    //w7     :qbits
+    //w8        :round factor
     //x9        :nnz
     //x10       :pointer to store non quantized dc value
 
+    sxtw      x3, w3
+    sxtw      x4, w4
     ldr       w8, [sp, #64]             //load round factor
     ldr       x10, [sp, #80]            //load addres for non quant val
-    neg       x7, x7                    //negate the qbit value for usiing lsl
+    neg       w7, w7                    //negate the qbit value for usiing lsl
     ldr       x9, [sp, #72]
     //------------fucntion loading done----------------;
 
@@ -485,10 +442,10 @@
 //* arguments         :  x0 :pointer to src buffer
 //                       x1 :pointer to dst buffer
 //                       x2 :pu2_scale_matrix
-//                       x2 :pu2_threshold_matrix
-//                       x3 :u4_qbits
-//                       x4 :u4_round_factor
-//                       x5 :pu1_nnz
+//                       x3 :pu2_threshold_matrix
+//                       w4 :u4_qbits
+//                       w5 :u4_round_factor
+//                       x6 :pu1_nnz
 // values returned   : none
 //
 // register usage    :
@@ -516,8 +473,8 @@
 //x1 :pointer to dst buffer
 //x2 :pu2_scale_matrix
 //x3 :pu2_threshold_matrix
-//x4 :u4_qbits
-//x5 :u4_round_factor
+//w4 :u4_qbits
+//w5 :u4_round_factor
 //x6 :pu1_nnz
 
     push_v_regs
@@ -632,10 +589,10 @@
 //* arguments         :  x0 :pointer to src buffer
 //                       x1 :pointer to dst buffer
 //                       x2 :pu2_scale_matrix
-//                       x2 :pu2_threshold_matrix
-//                       x3 :u4_qbits
-//                       x4 :u4_round_factor
-//                       x5 :pu1_nnz
+//                       x3 :pu2_threshold_matrix
+//                       w4 :u4_qbits
+//                       w5 :u4_round_factor
+//                       x6 :pu1_nnz
 // values returned   : none
 //
 // register usage    :
diff --git a/common/armv8/ih264_weighted_bi_pred_av8.s b/common/armv8/ih264_weighted_bi_pred_av8.s
index b039fba..475f690 100644
--- a/common/armv8/ih264_weighted_bi_pred_av8.s
+++ b/common/armv8/ih264_weighted_bi_pred_av8.s
@@ -103,28 +103,28 @@
 //                                     WORD32 src_strd1,
 //                                     WORD32 src_strd2,
 //                                     WORD32 dst_strd,
-//                                     UWORD16 log_WD,
-//                                     UWORD32 wt1,
-//                                     UWORD32 wt2,
-//                                     UWORD16 ofst1,
-//                                     UWORD16 ofst2,
-//                                     UWORD8 ht,
-//                                     UWORD8 wd)
+//                                     WORD32 log_WD,
+//                                     WORD32 wt1,
+//                                     WORD32 wt2,
+//                                     WORD16 ofst1,
+//                                     WORD16 ofst2,
+//                                     WORD32 ht,
+//                                     WORD32 wd)
 //
 //**************Variables Vs Registers*****************************************
 //    x0      => puc_src1
 //    x1      => puc_src2
 //    x2      => puc_dst
-//    x3      => src_strd1
-//    [sp]    => src_strd2 (x4)
-//    [sp+4]  => dst_strd  (x5)
-//    [sp+8]  => log_WD    (x6)
-//    [sp+12] => wt1       (x7)
-//   [sp+16] => wt2       (x8)
-//   [sp+20] => ofst1     (x9)
-//   [sp+24] => ofst2     (x10)
-//    [sp+28] => ht        (x11)
-//    [sp+32] => wd        (x12)
+//    w3      => src_strd1
+//    w4      => src_strd2
+//    w5      => dst_strd
+//    w6      => log_WD
+//    w7      => wt1
+//    [sp]    => wt2       (w8)
+//    [sp+8]  => ofst1     (w9)
+//    [sp+16] => ofst2     (w10)
+//    [sp+24] => ht        (w11)
+//    [sp+32] => wd        (w12)
 //
 .text
 .p2align 2
@@ -138,21 +138,23 @@
 
     // STMFD sp!, {x4-x12,x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
     stp       x19, x20, [sp, #-16]!
-    ldr       x8, [sp, #80]             //Load wt2 in x8
-    ldr       x9, [sp, #88]             //Load ofst1 in x9
-    add       x6, x6, #1                //x6  = log_WD + 1
-    sub       x20, x6, #0               //x13 = -(log_WD + 1)
-    neg       x10, x20
+    ldr       w8, [sp, #80]             //Load wt2 in w8
+    ldr       w9, [sp, #88]             //Load ofst1 in w9
+    add       w6, w6, #1                //w6  = log_WD + 1
+    neg       w10, w6                   //w10 = -(log_WD + 1)
     dup       v0.8h, w10                //Q0  = -(log_WD + 1) (32-bit)
-    ldr       x10, [sp, #96]            //Load ofst2 in x10
-    ldr       x11, [sp, #104]           //Load ht in x11
-    ldr       x12, [sp, #112]           //Load wd in x12
-    add       x9, x9, #1                //x9 = ofst1 + 1
-    add       x9, x9, x10               //x9 = ofst1 + ofst2 + 1
+    ldr       w10, [sp, #96]            //Load ofst2 in w10
+    ldr       w11, [sp, #104]           //Load ht in w11
+    ldr       w12, [sp, #112]           //Load wd in w12
+    add       w9, w9, #1                //w9 = ofst1 + 1
+    add       w9, w9, w10               //w9 = ofst1 + ofst2 + 1
     mov       v2.s[0], w7
     mov       v2.s[1], w8               //D2 = {wt1(32-bit), wt2(32-bit)}
-    asr       x9, x9, #1                //x9 = ofst = (ofst1 + ofst2 + 1) >> 1
+    asr       w9, w9, #1                //w9 = ofst = (ofst1 + ofst2 + 1) >> 1
     dup       v3.8b, w9                 //D3 = ofst (8-bit)
     cmp       w12, #16
     beq       loop_16                   //branch if wd is 16
@@ -383,28 +385,28 @@
 //                                       WORD32 src_strd1,
 //                                       WORD32 src_strd2,
 //                                       WORD32 dst_strd,
-//                                       UWORD16 log_WD,
-//                                       UWORD32 wt1,
-//                                       UWORD32 wt2,
-//                                       UWORD16 ofst1,
-//                                       UWORD16 ofst2,
-//                                       UWORD8 ht,
-//                                       UWORD8 wd)
+//                                       WORD32 log_WD,
+//                                       WORD32 wt1,
+//                                       WORD32 wt2,
+//                                       WORD32 ofst1,
+//                                       WORD32 ofst2,
+//                                       WORD32 ht,
+//                                       WORD32 wd)
 //
 //**************Variables Vs Registers*****************************************
 //    x0      => puc_src1
 //    x1      => puc_src2
 //    x2      => puc_dst
-//    x3      => src_strd1
-//    [sp]    => src_strd2 (x4)
-//    [sp+4]  => dst_strd  (x5)
-//    [sp+8]  => log_WD    (x6)
-//    [sp+12] => wt1       (x7)
-//   [sp+16] => wt2       (x8)
-//   [sp+20] => ofst1     (x9)
-//   [sp+24] => ofst2     (x10)
-//    [sp+28] => ht        (x11)
-//    [sp+32] => wd        (x12)
+//    w3      => src_strd1
+//    w4      => src_strd2
+//    w5      => dst_strd
+//    w6      => log_WD
+//    w7      => wt1
+//    [sp]    => wt2       (w8)
+//    [sp+8]  => ofst1     (w9)
+//    [sp+16] => ofst2     (w10)
+//    [sp+24] => ht        (w11)
+//    [sp+32] => wd        (w12)
 //
 
 
@@ -417,24 +419,22 @@
 
     // STMFD sp!, {x4-x12,x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
     stp       x19, x20, [sp, #-16]!
 
 
-    ldr       x8, [sp, #80]             //Load wt2 in x8
+    ldr       w8, [sp, #80]             //Load wt2 in w8
     dup       v4.4s, w8                 //Q2 = (wt2_u, wt2_v) (32-bit)
     dup       v2.4s, w7                 //Q1 = (wt1_u, wt1_v) (32-bit)
-    add       x6, x6, #1                //x6  = log_WD + 1
-    ldr       w9, [sp, #88]             //Load ofst1 in x9
-    sxtw      x9, w9
-    ldr       w10, [sp, #96]            //Load ofst2 in x10
-    sxtw      x10, w10
-    sub       x20, x6, #0               //x12 = -(log_WD + 1)
-    neg       x20, x20
+    add       w6, w6, #1                //w6  = log_WD + 1
+    ldr       w9, [sp, #88]             //Load ofst1 in w9
+    ldr       w10, [sp, #96]            //Load ofst2 in w10
+    neg       w20, w6                   //w20 = -(log_WD + 1)
     dup       v0.8h, w20                //Q0  = -(log_WD + 1) (16-bit)
     ldr       w11, [sp, #104]           //Load ht in x11
     ldr       w12, [sp, #112]           //Load wd in x12
-    sxtw      x11, w11
-    sxtw      x12, w12
     dup       v20.8h, w9                //0ffset1
     dup       v21.8h, w10               //0ffset2
     srhadd    v6.8b, v20.8b, v21.8b
diff --git a/common/armv8/ih264_weighted_pred_av8.s b/common/armv8/ih264_weighted_pred_av8.s
index 69ed3b0..f145217 100644
--- a/common/armv8/ih264_weighted_pred_av8.s
+++ b/common/armv8/ih264_weighted_pred_av8.s
@@ -89,22 +89,22 @@
 //                                  UWORD8 *puc_dst,
 //                                  WORD32 src_strd,
 //                                  WORD32 dst_strd,
-//                                  UWORD8 log_WD,
-//                                  UWORD32 wt,
-//                                  UWORD16 ofst,
-//                                  UWORD8 ht,
-//                                  UWORD8 wd)
+//                                  WORD32 log_WD,
+//                                  WORD32 wt,
+//                                  WORD32 ofst,
+//                                  WORD32 ht,
+//                                  WORD32 wd)
 //
 //**************Variables Vs Registers*****************************************
 //    x0      => puc_src
 //    x1      => puc_dst
-//    x2      => src_strd
-//    x3      => dst_strd
-//    [sp]    => log_WD (x4)
-//    [sp+4]  => wt     (x5)
-//   [sp+8]  => ofst   (x6)
-//    [sp+12] => ht     (x7)
-//    [sp+16] => wd     (x8)
+//    w2      => src_strd
+//    w3      => dst_strd
+//    w4      => log_WD
+//    w5      => wt
+//    w6      => ofst
+//    w7      => ht
+//    [sp]    => wd     (w8)
 //
 .text
 .p2align 2
@@ -118,13 +118,14 @@
 
     // STMFD sp!, {x4-x9,x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x2, w2
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
     ldr       w8, [sp, #80]             //Load wd
     sxtw      x8, w8
 
     dup       v2.4h, w5                 //D2 = wt (16-bit)
-    sub       x20, x4, #0               //x9 = -log_WD
-    neg       x9, x20
+    neg       w9, w4                    //w9 = -log_WD
     dup       v3.8b, w6                 //D3 = ofst (8-bit)
     cmp       w8, #16                   //check if wd is 16
     dup       v0.8h, w9                 //Q0 = -log_WD (16-bit)
@@ -318,22 +319,22 @@
 //                                    UWORD8 *puc_dst,
 //                                    WORD32 src_strd,
 //                                    WORD32 dst_strd,
-//                                    UWORD8 log_WD,
-//                                    UWORD32 wt,
-//                                    UWORD16 ofst,
-//                                    UWORD8 ht,
-//                                    UWORD8 wd)
+//                                    WORD32 log_WD,
+//                                    WORD32 wt,
+//                                    WORD32 ofst,
+//                                    WORD32 ht,
+//                                    WORD32 wd)
 //
 //**************Variables Vs Registers*****************************************
 //    x0      => puc_src
 //    x1      => puc_dst
-//    x2      => src_strd
-//    x3      => dst_strd
-//    [sp]    => log_WD (x4)
-//    [sp+4]  => wt     (x5)
-//   [sp+8]  => ofst   (x6)
-//    [sp+12] => ht     (x7)
-//    [sp+16] => wd     (x8)
+//    w2      => src_strd
+//    w3      => dst_strd
+//    w4      => log_WD
+//    w5      => wt
+//    w6      => ofst
+//    w7      => ht
+//    [sp]    => wd     (w8)
 //
 
 
@@ -345,13 +346,14 @@
 
     // STMFD sp!, {x4-x9,x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x2, w2
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
     ldr       w8, [sp, #80]             //Load wd
     sxtw      x8, w8
 
-    sub       x20, x4, #0               //x9 = -log_WD
-    neg       x9, x20
+    neg       w9, w4                    //w9 = -log_WD
     dup       v2.4s, w5                 //Q1 = {wt_u (16-bit), wt_v (16-bit)}
 
 
diff --git a/decoder.arm64.mk b/decoder.arm64.mk
index 2140b94..5ccf70f 100644
--- a/decoder.arm64.mk
+++ b/decoder.arm64.mk
@@ -6,7 +6,6 @@
 
 libavcd_srcs_c_arm64    += decoder/arm/ih264d_function_selector.c
 
-ifeq ($(ARCH_ARM_HAVE_NEON),true)
 libavcd_srcs_c_arm64    += decoder/arm/ih264d_function_selector_av8.c
 
 libavcd_srcs_asm_arm64    +=  common/armv8/ih264_intra_pred_chroma_av8.s
@@ -34,11 +33,6 @@
 libavcd_srcs_asm_arm64    +=  common/armv8/ih264_intra_pred_luma_8x8_av8.s
 
 libavcd_cflags_arm64 += -DDEFAULT_ARCH=D_ARCH_ARMV8_GENERIC
-else
-libavcd_cflags_arm64 += -DDISABLE_NEON -DDEFAULT_ARCH=D_ARCH_ARM_NONEON
-endif
-
-
 
 
 LOCAL_SRC_FILES_arm64 += $(libavcd_srcs_c_arm64) $(libavcd_srcs_asm_arm64)
diff --git a/decoder/ih264d_api.c b/decoder/ih264d_api.c
index b06623e..bbc7201 100644
--- a/decoder/ih264d_api.c
+++ b/decoder/ih264d_api.c
@@ -2632,6 +2632,9 @@
 
     ps_ctl_op->u4_error_code = 0;
 
+    /* Ignore dangling fields during flush */
+    ps_dec->u1_top_bottom_decoded = 0;
+
     return IV_SUCCESS;
 }
 
@@ -3029,40 +3032,30 @@
         }
     }
 
-    if((0 != ps_dec->u4_app_disp_width)
-                    && (ps_ctl_ip->u4_disp_wd
-                                    != ps_dec->u4_app_disp_width))
+    if(ps_ctl_ip->u4_disp_wd >= ps_dec->u2_pic_wd)
     {
+        ps_dec->u4_app_disp_width = ps_ctl_ip->u4_disp_wd;
+    }
+    else if(0 == ps_dec->i4_header_decoded)
+    {
+        ps_dec->u4_app_disp_width = ps_ctl_ip->u4_disp_wd;
+    }
+    else if(ps_ctl_ip->u4_disp_wd == 0)
+    {
+        ps_dec->u4_app_disp_width = 0;
+    }
+    else
+    {
+        /*
+         * Set the display width to zero. This will ensure that the wrong value we had stored (0xFFFFFFFF)
+         * does not propagate.
+         */
+        ps_dec->u4_app_disp_width = 0;
         ps_ctl_op->u4_error_code |= (1 << IVD_UNSUPPORTEDPARAM);
         ps_ctl_op->u4_error_code |= ERROR_DISP_WIDTH_INVALID;
         ret = IV_FAIL;
     }
-    else
-    {
-        if(ps_ctl_ip->u4_disp_wd >= ps_dec->u2_pic_wd)
-        {
-            ps_dec->u4_app_disp_width = ps_ctl_ip->u4_disp_wd;
-        }
-        else if(0 == ps_dec->i4_header_decoded)
-        {
-            ps_dec->u4_app_disp_width = ps_ctl_ip->u4_disp_wd;
-        }
-        else if(ps_ctl_ip->u4_disp_wd == 0)
-        {
-            ps_dec->u4_app_disp_width = 0;
-        }
-        else
-        {
-            /*
-             * Set the display width to zero. This will ensure that the wrong value we had stored (0xFFFFFFFF)
-             * does not propogate.
-             */
-            ps_dec->u4_app_disp_width = 0;
-            ps_ctl_op->u4_error_code |= (1 << IVD_UNSUPPORTEDPARAM);
-            ps_ctl_op->u4_error_code |= ERROR_DISP_WIDTH_INVALID;
-            ret = IV_FAIL;
-        }
-    }
+
     if(ps_ctl_ip->e_vid_dec_mode == IVD_DECODE_FRAME)
         ps_dec->i4_decode_header = 0;
     else if(ps_ctl_ip->e_vid_dec_mode == IVD_DECODE_HEADER)
diff --git a/decoder/ih264d_dpb_mgr.c b/decoder/ih264d_dpb_mgr.c
index a75aeee..6261986 100644
--- a/decoder/ih264d_dpb_mgr.c
+++ b/decoder/ih264d_dpb_mgr.c
@@ -17,9 +17,10 @@
  *****************************************************************************
  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
+#ifdef __ANDROID__
 #include "log/log.h"
 #include <cutils/log.h>
-
+#endif
 #include "ih264_typedefs.h"
 #include "ih264_macros.h"
 #include "ih264_platform_macros.h"
@@ -886,8 +887,10 @@
                 {
                     if (j >= MAX_REF_BUFS)
                     {
+#ifdef __ANDROID__
                         ALOGE("b/25818142");
                         android_errorWriteLog(0x534e4554, "25818142");
+#endif
                         ps_dpb_cmds->u1_num_of_commands = 0;
                         return -1;
                     }
diff --git a/decoder/ih264d_parse_islice.c b/decoder/ih264d_parse_islice.c
index 0312060..b94f5a4 100644
--- a/decoder/ih264d_parse_islice.c
+++ b/decoder/ih264d_parse_islice.c
@@ -509,9 +509,7 @@
         MEMSET_16BYTES(&ps_dec->pu1_left_mv_ctxt_inc[0][0], 0);
         *((UWORD32 *)ps_dec->pi1_left_ref_idx_ctxt_inc) = 0;
         MEMSET_16BYTES(p_curr_ctxt->u1_mv, 0);
-        pi1_buf = p_curr_ctxt->i1_ref_idx;
-        pi4_buf = (WORD32 *)pi1_buf;
-        *pi4_buf = 0;
+        memset(p_curr_ctxt->i1_ref_idx, 0, 4);
     }
 
     if(u1_mb_type == I_4x4_MB)
diff --git a/decoder/ih264d_parse_mb_header.c b/decoder/ih264d_parse_mb_header.c
index f30ad67..9a6a1f9 100644
--- a/decoder/ih264d_parse_mb_header.c
+++ b/decoder/ih264d_parse_mb_header.c
@@ -1172,7 +1172,6 @@
     /***************************************************************/
     /* Store abs_mvd_values cabac contexts                         */
     /***************************************************************/
-#ifndef ARM
     {
         UWORD8 u1_i;
         for(u1_i = 0; u1_i < u1_part_wd; u1_i++, pu1_top_mv_ctxt += 4)
@@ -1187,46 +1186,6 @@
             pu1_lft_mv_ctxt[1] = u1_abs_mvd_y;
         }
     }
-#else
-    /* Optimising the loop, with Little-Endian Assumption */
-    {
-        UWORD16 *pu2_top_cxt = (UWORD16 *)pu1_top_mv_ctxt;
-        UWORD16 *pu2_lft_cxt = (UWORD16 *)pu1_lft_mv_ctxt;
-        UWORD16 u2_pack_mvd = (UWORD16)((u1_abs_mvd_y << 8) | u1_abs_mvd_x);
-        UWORD8 u1_wd = u1_part_wd, u1_ht = u1_part_ht;
-
-        u1_wd--;
-        *pu2_top_cxt = u2_pack_mvd;
-        pu2_top_cxt += 2;
-        if(u1_wd)
-        {
-            u1_wd--;
-            *pu2_top_cxt = u2_pack_mvd;
-            pu2_top_cxt += 2;
-        }
-        if(u1_wd)
-        {
-            *pu2_top_cxt = u2_pack_mvd;
-            pu2_top_cxt += 2;
-            *pu2_top_cxt = u2_pack_mvd;
-        }
-        u1_ht--;
-        *pu2_lft_cxt = u2_pack_mvd;
-        pu2_lft_cxt += 2;
-        if(u1_ht)
-        {
-            u1_ht--;
-            *pu2_lft_cxt = u2_pack_mvd;
-            pu2_lft_cxt += 2;
-        }
-        if(u1_ht)
-        {
-            *pu2_lft_cxt = u2_pack_mvd;
-            pu2_lft_cxt += 2;
-            *pu2_lft_cxt = u2_pack_mvd;
-        }
-    }
-#endif
 }
 
 /*****************************************************************************/
diff --git a/decoder/ih264d_process_intra_mb.c b/decoder/ih264d_process_intra_mb.c
index 279ff87..211d796 100644
--- a/decoder/ih264d_process_intra_mb.c
+++ b/decoder/ih264d_process_intra_mb.c
@@ -930,7 +930,9 @@
             }
         }
         {
-            UWORD8 au1_ngbr_pels[33];
+            /* Align the size to a multiple of 8, so that SIMD functions
+               can read 64 bits at a time. Only 33 bytes are actually used */
+            UWORD8 au1_ngbr_pels[40];
             /* Get neighbour pixels */
             /* left pels */
             if(u2_use_left_mb)
@@ -1175,7 +1177,9 @@
         /* Scan the sub-blocks in Raster Scan Order */
         for(u1_sub_mb_num = 0; u1_sub_mb_num < 16; u1_sub_mb_num++)
         {
-            UWORD8 au1_ngbr_pels[13];
+            /* Align the size to a multiple of 8, so that SIMD functions
+               can read 64 bits at a time. Only 13 bytes are actually used */
+            UWORD8 au1_ngbr_pels[16];
 
             u1_sub_blk_x = u1_sub_mb_num & 0x3;
             u1_sub_blk_y = u1_sub_mb_num >> 2;
@@ -1664,7 +1668,9 @@
             }
 
             {
-                UWORD8 au1_ngbr_pels[25];
+                /* Align the size to a multiple of 8, so that SIMD functions
+                can read 64 bits at a time. Only 25 bytes are actually used */
+                UWORD8 au1_ngbr_pels[32];
                 WORD32 ngbr_avail;
                 ngbr_avail = u1_is_left_sub_block << 0;
                 ngbr_avail |= u1_is_top_sub_block << 2;
diff --git a/decoder/ih264d_structs.h b/decoder/ih264d_structs.h
index 4260393..d8b02da 100644
--- a/decoder/ih264d_structs.h
+++ b/decoder/ih264d_structs.h
@@ -1057,7 +1057,6 @@
     prev_seq_params_t s_prev_seq_params;
     UWORD8 u1_cur_mb_fld_dec_flag; /* current Mb fld or Frm */
 
-    WORD8 pi1_left_pred_mode[8];
     UWORD8 u1_topleft_mb_fld;
     UWORD8 u1_topleft_mbtype;
     UWORD8 u1_topleft_mb_fld_bot;
@@ -1067,6 +1066,9 @@
     UWORD16 u2_top_left_mask;
     UWORD16 u2_top_right_mask;
     dec_err_status_t * ps_dec_err_status;
+    /* Ensure pi1_left_pred_mode is aligned to a 4-byte boundary,
+    by declaring this after a pointer or an integer */
+    WORD8 pi1_left_pred_mode[8];
 
     UWORD8 u1_mb_idx_mv;
     UWORD16 u2_mv_2mb[2];
diff --git a/decoder/ih264d_utils.c b/decoder/ih264d_utils.c
index bc5c484..a177a18 100644
--- a/decoder/ih264d_utils.c
+++ b/decoder/ih264d_utils.c
@@ -1893,6 +1893,10 @@
     RETURN_IF((NULL == pv_buf), IV_FAIL);
     ps_dec->p_ctxt_inc_mb_map = pv_buf;
 
+    /* 0th entry of CtxtIncMbMap will always contain default values
+     for CABAC context representing MB not available */
+    ps_dec->p_ctxt_inc_mb_map += 1;
+
     size = (sizeof(mv_pred_t) * ps_dec->u1_recon_mb_grp
                         * 16);
     pv_buf = ps_dec->pf_aligned_alloc(pv_mem_ctxt, 128, size);
@@ -2076,9 +2080,6 @@
     RETURN_IF((NULL == pv_buf), IV_FAIL);
     ps_dec->pu1_pic_buf_base = pv_buf;
 
-    /* 0th entry of CtxtIncMbMap will be always be containing default values
-     for CABAC context representing MB not available */
-    ps_dec->p_ctxt_inc_mb_map += 1;
     /* Post allocation Increment Actions */
 
     /***************************************************************************/
diff --git a/encoder.arm64.mk b/encoder.arm64.mk
index f95a29f..73cce1b 100644
--- a/encoder.arm64.mk
+++ b/encoder.arm64.mk
@@ -7,7 +7,6 @@
 
 libavce_srcs_c_arm64    += encoder/arm/ih264e_function_selector.c
 
-ifeq ($(ARCH_ARM_HAVE_NEON),true)
 libavce_srcs_c_arm64    += encoder/arm/ih264e_function_selector_av8.c
 
 libavce_srcs_asm_arm64    +=  common/armv8/ih264_resi_trans_quant_av8.s
@@ -35,12 +34,6 @@
 #ME
 libavce_srcs_asm_arm64    +=  encoder/armv8/ime_distortion_metrics_av8.s
 
-else
-libavce_cflags_arm64 += -DDISABLE_NEON
-endif
-
-
-
 
 LOCAL_SRC_FILES_arm64 += $(libavce_srcs_c_arm64) $(libavce_srcs_asm_arm64)
 LOCAL_C_INCLUDES_arm64 += $(libavce_inc_dir_arm64)
diff --git a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
index df06d41..c23a6ea 100644
--- a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
+++ b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
@@ -82,9 +82,9 @@
 //x0 = pu1_src,
 //x1 = pu1_ngbr_pels_i16,
 //x2 = pu1_dst,
-//x3 = src_strd,
-//x4 = dst_strd,
-//x5 = u4_n_avblty,
+//w3 = src_strd,
+//w4 = dst_strd,
+//w5 = u4_n_avblty,
 //x6 = u4_intra_mode,
 //x7 = pu4_sadmin
 
@@ -92,9 +92,11 @@
 
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
     stp       x19, x20, [sp, #-16]!
 
-    ldr       x16, [sp, #80]
+    ldr       w16, [sp, #80]
     mov       x17, x4
     mov       x14, x6
     mov       x15, x7
@@ -105,13 +107,13 @@
     mov       w10, #0
     mov       w11 , #3
 
-    ands      x6, x5, #0x01
+    ands      w6, w5, #0x01
     beq       top_available             //LEFT NOT AVAILABLE
     ld1       {v0.16b}, [x1]
     add       w10, w10, #8
     add       w11, w11, #1
 top_available:
-    ands      x6, x5, #0x04
+    ands      w6, w5, #0x04
     beq       none_available
     add       x6, x1, #17
     ld1       {v1.16b}, [x6]
@@ -119,7 +121,7 @@
     add       w11, w11, #1
     b         summation
 none_available:
-    cmp       x5, #0
+    cmp       w5, #0
     bne       summation
     mov       w6, #128
     dup       v30.16b, w6
@@ -469,16 +471,16 @@
     mov       x11, #1
     lsl       x11, x11, #30
 
-    mov       x0, x16
+    mov       w0, w16
     //--------------------------------------------
-    ands      x7, x0, #01               // vert mode valid????????????
+    ands      w7, w0, #01               // vert mode valid????????????
     csel      x8, x11, x8, eq
 
 
-    ands      x6, x0, #02               // horz mode valid????????????
+    ands      w6, w0, #02               // horz mode valid????????????
     csel      x9, x11, x9, eq
 
-    ands      x6, x0, #04               // dc mode valid????????????
+    ands      w6, w0, #04               // dc mode valid????????????
     csel      x10, x11, x10, eq
 
 
diff --git a/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s b/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s
index bb2526d..4014c4f 100644
--- a/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s
+++ b/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s
@@ -82,9 +82,9 @@
 //x0 = pu1_src,
 //x1 = pu1_ngbr_pels_i16,
 //x2 = pu1_dst,
-//x3 = src_strd,
-//x4 = dst_strd,
-//x5 = u4_n_avblty,
+//w3 = src_strd,
+//w4 = dst_strd,
+//w5 = u4_n_avblty,
 //x6 = u4_intra_mode,
 //x7 = pu4_sadmin
 
@@ -92,20 +92,22 @@
 
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
     stp       x19, x20, [sp, #-16]!
     //-----------------------
-    ldr       x16, [sp, #80]
+    ldr       w16, [sp, #80]
     mov       x17, x4
-    mov       x18, x5
+    mov       w18, w5
     mov       x14, x6
     mov       x15, x7
 
-    mov       x19, #5
-    ands      x6, x5, x19
+    mov       w19, #5
+    ands      w6, w5, w19
     beq       none_available
-    cmp       x6, #1
+    cmp       w6, #1
     beq       left_only_available
-    cmp       x6, #4
+    cmp       w6, #4
     beq       top_only_available
 
 all_available:
@@ -368,20 +370,20 @@
 
     mov       x11, #1
 //-----------------------
-    mov       x0, x16 // u4_valid_intra_modes
+    mov       w0, w16 // u4_valid_intra_modes
 
 //--------------------------------------------
 
 
     lsl       x11, x11, #30
 
-    ands      x7, x0, #04               // vert mode valid????????????
+    ands      w7, w0, #04               // vert mode valid????????????
     csel      x8, x11, x8, eq
 
-    ands      x6, x0, #02               // horz mode valid????????????
+    ands      w6, w0, #02               // horz mode valid????????????
     csel      x9, x11, x9, eq
 
-    ands      x6, x0, #01               // dc mode valid????????????
+    ands      w6, w0, #01               // dc mode valid????????????
     csel      x10, x11, x10, eq
 
 
diff --git a/encoder/armv8/ih264e_half_pel_av8.s b/encoder/armv8/ih264e_half_pel_av8.s
index 8f27104..cdac8da 100644
--- a/encoder/armv8/ih264e_half_pel_av8.s
+++ b/encoder/armv8/ih264e_half_pel_av8.s
@@ -86,6 +86,8 @@
 ih264e_sixtapfilter_horz_av8:
     // STMFD sp!,{x14}
     push_v_regs
+    sxtw      x2, w2
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
     movi      v0.8b, #5
@@ -263,6 +265,8 @@
 ih264e_sixtap_filter_2dvh_vert_av8:
     // STMFD sp!,{x10,x11,x12,x14}
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
     stp       x19, x20, [sp, #-16]!
 
 ////x0 - pu1_ref
diff --git a/encoder/armv8/ime_distortion_metrics_av8.s b/encoder/armv8/ime_distortion_metrics_av8.s
index 47c3425..00d11c0 100644
--- a/encoder/armv8/ime_distortion_metrics_av8.s
+++ b/encoder/armv8/ime_distortion_metrics_av8.s
@@ -95,6 +95,8 @@
     .global ime_compute_sad_16x16_fast_av8
 ime_compute_sad_16x16_fast_av8:
     push_v_regs
+    sxtw      x2, w2
+    sxtw      x3, w3
     lsl       x2, x2, #1
     lsl       x3, x3, #1
 
@@ -179,6 +181,8 @@
     //chheck what stride incremtn to use
     //earlier code did not have this lsl
     push_v_regs
+    sxtw      x2, w2
+    sxtw      x3, w3
     mov       x6, #2
     movi      v30.8h, #0
 
@@ -255,6 +259,8 @@
 ime_compute_sad_16x16_ea8_av8:
 
     push_v_regs
+    sxtw      x2, w2
+    sxtw      x3, w3
     movi      v30.8h, #0
 
     add       x7, x0, x2
@@ -381,9 +387,12 @@
     // x0    = ref1     <UWORD8 *>
     // x1    = ref2     <UWORD8 *>
     // x2    = src     <UWORD8 *>
-    // x3    = RefBufferWidth <UWORD32>
-    // stack = CurBufferWidth <UWORD32>, psad <UWORD32 *>
+    // w3    = RefBufferWidth <UWORD32>
+    // w4    = CurBufferWidth <UWORD32>
+    // x5    = psad <UWORD32 *>
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
     mov       x6, #8
     movi      v30.8h, #0
     movi      v31.8h, #0
@@ -459,16 +468,15 @@
     // x1    = ref2     <UWORD8 *>
     // x2    = ref3     <UWORD8 *>
     // x3    = src     <UWORD8 *>
-    // stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *>
+    // w4    = RefBufferWidth <UWORD32>
+    // w5    = CurBufferWidth <UWORD32>
+    // x6    = psad <UWORD32 *>
 
 
-    // x0    = ref1     <UWORD8 *>
-    // x1    = ref2     <UWORD8 *>
-    // x2    = src     <UWORD8 *>
-    // x3    = RefBufferWidth <UWORD32>
-    // stack = CurBufferWidth <UWORD32>, psad <UWORD32 *>
     push_v_regs
-    mov       x6, #16
+    sxtw      x4, w4
+    sxtw      x5, w5
+    mov       x7, #16
     movi      v29.8h, #0
     movi      v30.8h, #0
     movi      v31.8h, #0
@@ -499,15 +507,15 @@
     uabal     v31.8h, v6.8b, v7.8b
     uabal2    v31.8h, v6.16b, v7.16b
 
-    subs      x6, x6, #1
-    bne       core_loop_ime_calculate_sad2_prog_av8
+    subs      x7, x7, #1
+    bne       core_loop_ime_calculate_sad3_prog_av8
 
     addp      v30.8h, v30.8h, v31.8h
     uaddlp    v30.4s, v30.8h
     addp      v30.2s, v30.2s, v30.2s
     shl       v30.2s, v30.2s, #1
 
-    st1       {v30.2s}, [x5]
+    st1       {v30.2s}, [x6]
     pop_v_regs
     ret
 
@@ -544,6 +552,8 @@
     .global ime_sub_pel_compute_sad_16x16_av8
 ime_sub_pel_compute_sad_16x16_av8:
     push_v_regs
+    sxtw      x4, w4
+    sxtw      x5, w5
     sub       x7, x1, #1                //x left
     sub       x8, x2, x5                //y top
     sub       x9, x3, #1                //xy  left
@@ -647,6 +657,8 @@
     .global ime_compute_sad_16x16_av8
 ime_compute_sad_16x16_av8:
     push_v_regs
+    sxtw      x2, w2
+    sxtw      x3, w3
     mov       x6, #4
     movi      v30.8h, #0
 
@@ -702,6 +714,8 @@
     .global ime_calculate_sad4_prog_av8
 ime_calculate_sad4_prog_av8:
     push_v_regs
+    sxtw      x2, w2
+    sxtw      x3, w3
     sub       x5, x0, #1                //left
     add       x6, x0, #1                //right
     sub       x7, x0, x2                //top
@@ -777,13 +791,15 @@
 ime_compute_satqd_16x16_lumainter_av8:
     //x0 :pointer to src buffer
     //x1 :pointer to est buffer
-    //x2 :Source stride
-    //x3 :Pred stride
+    //w2 :Source stride
+    //w3 :Pred stride
     //x4 :Threshold pointer
     //x5 :Distortion,ie SAD
     //x6 :is nonzero
     //x7 :loop counter
     push_v_regs
+    sxtw      x2, w2
+    sxtw      x3, w3
     stp       d8, d9, [sp, #-16]!
     stp       d10, d11, [sp, #-16]!
     stp       d12, d13, [sp, #-16]!
diff --git a/encoder/ih264e_api.c b/encoder/ih264e_api.c
index e0c9f83..2ecfdf5 100644
--- a/encoder/ih264e_api.c
+++ b/encoder/ih264e_api.c
@@ -3823,7 +3823,7 @@
         UWORD8 *pu1_buf = ps_mem_rec->pv_base;
 
         /* size of header data of 1 mb */
-        size = 40;
+        size = sizeof(mb_hdr_t);
 
         /* size for 1 row of mbs */
         size = size * max_mb_cols;
diff --git a/encoder/ih264e_cabac_encode.c b/encoder/ih264e_cabac_encode.c
index ecc30f5..e49ab58 100644
--- a/encoder/ih264e_cabac_encode.c
+++ b/encoder/ih264e_cabac_encode.c
@@ -339,7 +339,7 @@
     for (i = 0; i < 16; i += 2)
     {
         /* sub blk idx 1 */
-        byte = *pu1_intra_4x4_modes++;
+        byte = pu1_intra_4x4_modes[i >> 1];
         if (byte & 0x1)
         {
             ih264e_cabac_encode_bin(ps_cabac_ctxt,
@@ -1540,14 +1540,14 @@
             u2_abs_mvd_y_b = (UWORD16) pu1_top_mv_ctxt[1];
             u2_abs_mvd_x_a = (UWORD16) pu1_lft_mv_ctxt[0];
             u2_abs_mvd_y_a = (UWORD16) pu1_lft_mv_ctxt[1];
-            u2_mv = *(pi2_mv_ptr++);
+            u2_mv = pi2_mv_ptr[0];
 
             ih264e_cabac_enc_ctx_mvd(u2_mv, MVD_X,
                                     (UWORD16) (u2_abs_mvd_x_a + u2_abs_mvd_x_b),
                                     ps_cabac_ctxt);
 
             u1_abs_mvd_x = CLIP3(0, 127, ABS(u2_mv));
-            u2_mv = *(pi2_mv_ptr++);
+            u2_mv = pi2_mv_ptr[1];
 
             ih264e_cabac_enc_ctx_mvd(u2_mv, MVD_Y,
                                     (UWORD16) (u2_abs_mvd_y_a + u2_abs_mvd_y_b),
@@ -1555,6 +1555,7 @@
 
             u1_abs_mvd_y = CLIP3(0, 127, ABS(u2_mv));
         }
+
         /***************************************************************/
         /* Store abs_mvd_values cabac contexts                         */
         /***************************************************************/
@@ -1571,14 +1572,14 @@
             u2_abs_mvd_y_b = (UWORD16) pu1_top_mv_ctxt[3];
             u2_abs_mvd_x_a = (UWORD16) pu1_lft_mv_ctxt[2];
             u2_abs_mvd_y_a = (UWORD16) pu1_lft_mv_ctxt[3];
-            u2_mv = *(pi2_mv_ptr++);
+            u2_mv = pi2_mv_ptr[2];
 
             ih264e_cabac_enc_ctx_mvd(u2_mv, MVD_X,
                                     (UWORD16) (u2_abs_mvd_x_a + u2_abs_mvd_x_b),
                                     ps_cabac_ctxt);
 
             u1_abs_mvd_x = CLIP3(0, 127, ABS(u2_mv));
-            u2_mv = *(pi2_mv_ptr++);
+            u2_mv = pi2_mv_ptr[3];
 
             ih264e_cabac_enc_ctx_mvd(u2_mv, MVD_Y,
                                     (UWORD16) (u2_abs_mvd_y_a + u2_abs_mvd_y_b),
@@ -1624,11 +1625,11 @@
     cabac_ctxt_t *ps_cabac_ctxt = ps_ent_ctxt->ps_cabac;
     /* packed header data */
     UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data;
+    mb_hdr_common_t *ps_mb_hdr = (mb_hdr_common_t *)ps_ent_ctxt->pv_mb_header_data;
     mb_info_ctxt_t *ps_curr_ctxt;
     WORD32 mb_tpm, mb_type, cbp, chroma_intra_mode, luma_intra_mode;
     WORD8 mb_qp_delta;
     UWORD32 u4_cbp_l, u4_cbp_c;
-    WORD32 byte_count = 0;
     WORD32 bitstream_start_offset, bitstream_end_offset;
 
     if ((ps_bitstream->u4_strm_buf_offset + MIN_STREAM_SIZE_MB)
@@ -1638,12 +1639,10 @@
         return (IH264E_BITSTREAM_BUFFER_OVERFLOW);
     }
     /* mb header info */
-    mb_tpm = *pu1_byte++;
-    byte_count++;
-    cbp = *pu1_byte++;
-    byte_count++;
-    mb_qp_delta = *pu1_byte++;
-    byte_count++;
+    mb_tpm = ps_mb_hdr->u1_mb_type_mode;
+    cbp = ps_mb_hdr->u1_cbp;
+    mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta;
+
     /* mb type */
     mb_type = mb_tpm & 0xF;
 
@@ -1671,9 +1670,10 @@
                                    MB_TYPE_I_SLICE);
 
     if (mb_type == I4x4)
-    {   /* Encode 4x4 MB modes */
-        ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, pu1_byte);
-        byte_count += 8;
+    {
+        /* Encode 4x4 MB modes */
+        mb_hdr_i4x4_t *ps_mb_hdr_i4x4 = (mb_hdr_i4x4_t *)ps_ent_ctxt->pv_mb_header_data;
+        ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, ps_mb_hdr_i4x4->au1_sub_blk_modes);
     }
     /* Encode chroma mode */
     ih264e_cabac_enc_chroma_predmode(chroma_intra_mode, ps_cabac_ctxt);
@@ -1731,17 +1731,18 @@
     memset(ps_curr_ctxt->u1_mv, 0, 16);
     memset(ps_cabac_ctxt->pu1_left_mv_ctxt_inc, 0, 16);
     ps_cabac_ctxt->ps_curr_ctxt_mb_info->u1_cbp = cbp;
-    ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count;
+
     if (mb_type == I16x16)
     {
         ps_curr_ctxt->u1_mb_type = CAB_I16x16;
-
+        pu1_byte += sizeof(mb_hdr_i16x16_t);
     }
     else
     {
         ps_curr_ctxt->u1_mb_type = CAB_I4x4;
-
+        pu1_byte += sizeof(mb_hdr_i4x4_t);
     }
+    ps_ent_ctxt->pv_mb_header_data = pu1_byte;
     return IH264E_SUCCESS;
 }
 
@@ -1778,8 +1779,8 @@
     WORD32 mb_tpm, mb_type, cbp, chroma_intra_mode, luma_intra_mode;
     WORD8 mb_qp_delta;
     UWORD32 u4_cbp_l, u4_cbp_c;
-    WORD32 byte_count = 0;
     UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data;
+    mb_hdr_common_t *ps_mb_hdr = (mb_hdr_common_t *)ps_ent_ctxt->pv_mb_header_data;
 
     if ((ps_bitstream->u4_strm_buf_offset + MIN_STREAM_SIZE_MB)
                     >= ps_bitstream->u4_max_strm_size)
@@ -1788,8 +1789,7 @@
         return (IH264E_BITSTREAM_BUFFER_OVERFLOW);
     }
     /* mb header info */
-    mb_tpm = *pu1_byte++;
-    byte_count++;
+    mb_tpm = ps_mb_hdr->u1_mb_type_mode;
 
     /* mb type */
     mb_type = mb_tpm & 0xF;
@@ -1800,10 +1800,8 @@
     /* if Intra MB */
     if (mb_type == I16x16 || mb_type == I4x4)
     {
-        cbp = *pu1_byte++;
-        byte_count++;
-        mb_qp_delta = *pu1_byte++;
-        byte_count++;
+        cbp = ps_mb_hdr->u1_cbp;
+        mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta;
 
         /* Starting bitstream offset for header in bits */
         bitstream_start_offset = GET_NUM_BITS(ps_bitstream);
@@ -1833,9 +1831,10 @@
         }
 
         if (mb_type == I4x4)
-        {   /* Intra 4x4 modes */
-            ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, pu1_byte);
-            byte_count += 8;
+        {
+            /* Intra 4x4 modes */
+            mb_hdr_i4x4_t *ps_mb_hdr_i4x4 = (mb_hdr_i4x4_t *)ps_ent_ctxt->pv_mb_header_data;
+            ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, ps_mb_hdr_i4x4->au1_sub_blk_modes);
         }
         chroma_intra_mode = (mb_tpm >> 6);
 
@@ -1901,13 +1900,15 @@
         if (mb_type == I16x16)
         {
             ps_curr_ctxt->u1_mb_type = CAB_I16x16;
+            pu1_byte += sizeof(mb_hdr_i16x16_t);
         }
         else
         {
             ps_curr_ctxt->u1_mb_type = CAB_I4x4;
+            pu1_byte += sizeof(mb_hdr_i4x4_t);
         }
 
-        ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count;
+        ps_ent_ctxt->pv_mb_header_data = pu1_byte;
 
         return IH264E_SUCCESS;
     }
@@ -1918,10 +1919,9 @@
         /* Encoding P16x16 */
         if (mb_type != PSKIP)
         {
-            cbp = *pu1_byte++;
-            byte_count++;
-            mb_qp_delta = *pu1_byte++;
-            byte_count++;
+            mb_hdr_p16x16_t *ps_mb_hdr_p16x16 = (mb_hdr_p16x16_t *)ps_ent_ctxt->pv_mb_header_data;
+            cbp = ps_mb_hdr->u1_cbp;
+            mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta;
 
             /* Encoding mb_skip */
             ih264e_cabac_enc_mb_skip(0, ps_cabac_ctxt, MB_SKIP_FLAG_P_SLICE);
@@ -1937,8 +1937,8 @@
             }
             ps_curr_ctxt->u1_mb_type = CAB_P;
             {
-                WORD16 *pi2_mv_ptr = (WORD16 *) pu1_byte;
-                byte_count += 4;
+                WORD16 *pi2_mv_ptr = (WORD16 *) ps_mb_hdr_p16x16->ai2_mv;
+
                 ps_curr_ctxt->u1_mb_type = (ps_curr_ctxt->u1_mb_type
                                             | CAB_NON_BD16x16);
                  /* Encoding motion vector for P16x16 */
@@ -1960,6 +1960,8 @@
             /* Starting bitstream offset for residue */
             bitstream_start_offset = bitstream_end_offset;
 
+            pu1_byte += sizeof(mb_hdr_p16x16_t);
+
         }
         else/* MB = PSKIP */
         {
@@ -1978,6 +1980,7 @@
                             - bitstream_start_offset;
             /* Starting bitstream offset for residue */
 
+            pu1_byte += sizeof(mb_hdr_pskip_t);
         }
 
         if (cbp > 0)
@@ -2002,7 +2005,8 @@
         }
         ps_curr_ctxt->u1_intrapred_chroma_mode = 0;
         ps_curr_ctxt->u1_cbp = cbp;
-        ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count;
+        ps_ent_ctxt->pv_mb_header_data = pu1_byte;
+
         return IH264E_SUCCESS;
     }
 }
@@ -2066,8 +2070,8 @@
     WORD32 mb_tpm, mb_type, cbp, chroma_intra_mode, luma_intra_mode;
     WORD8 mb_qp_delta;
     UWORD32 u4_cbp_l, u4_cbp_c;
-    WORD32 byte_count = 0;
     UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data;
+    mb_hdr_common_t *ps_mb_hdr = (mb_hdr_common_t *)ps_ent_ctxt->pv_mb_header_data;
 
     if ((ps_bitstream->u4_strm_buf_offset + MIN_STREAM_SIZE_MB)
                     >= ps_bitstream->u4_max_strm_size)
@@ -2076,8 +2080,7 @@
         return (IH264E_BITSTREAM_BUFFER_OVERFLOW);
     }
     /* mb header info */
-    mb_tpm = *pu1_byte++;
-    byte_count++;
+    mb_tpm = ps_mb_hdr->u1_mb_type_mode;
 
     /* mb type */
     mb_type = mb_tpm & 0xF;
@@ -2088,10 +2091,8 @@
     /* if Intra MB */
     if (mb_type == I16x16 || mb_type == I4x4)
     {
-        cbp = *pu1_byte++;
-        byte_count++;
-        mb_qp_delta = *pu1_byte++;
-        byte_count++;
+        cbp = ps_mb_hdr->u1_cbp;
+        mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta;
 
         /* Starting bitstream offset for header in bits */
         bitstream_start_offset = GET_NUM_BITS(ps_bitstream);
@@ -2138,9 +2139,10 @@
         }
 
         if (mb_type == I4x4)
-        { /* Intra 4x4 modes */
-            ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, pu1_byte);
-            byte_count += 8;
+        {
+            /* Intra 4x4 modes */
+            mb_hdr_i4x4_t *ps_mb_hdr_i4x4 = (mb_hdr_i4x4_t *)ps_ent_ctxt->pv_mb_header_data;
+            ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, ps_mb_hdr_i4x4->au1_sub_blk_modes);
         }
         chroma_intra_mode = (mb_tpm >> 6);
 
@@ -2206,13 +2208,15 @@
         if (mb_type == I16x16)
         {
             ps_curr_ctxt->u1_mb_type = CAB_I16x16;
+            pu1_byte += sizeof(mb_hdr_i16x16_t);
         }
         else
         {
             ps_curr_ctxt->u1_mb_type = CAB_I4x4;
+            pu1_byte += sizeof(mb_hdr_i4x4_t);
         }
 
-        ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count;
+        ps_ent_ctxt->pv_mb_header_data = pu1_byte;
 
         return IH264E_SUCCESS;
     }
@@ -2224,10 +2228,9 @@
         /* Encoding B_Direct_16x16 */
         if (mb_type == BDIRECT)
         {
-            cbp = *pu1_byte++;
-            byte_count++;
-            mb_qp_delta = *pu1_byte++;
-            byte_count++;
+            cbp = ps_mb_hdr->u1_cbp;
+            mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta;
+
 
             /* Encoding mb_skip */
             ih264e_cabac_enc_mb_skip(0, ps_cabac_ctxt, MB_SKIP_FLAG_B_SLICE);
@@ -2275,6 +2278,7 @@
             bitstream_start_offset = bitstream_end_offset;
             /* Starting bitstream offset for residue */
 
+            pu1_byte += sizeof(mb_hdr_bdirect_t);
         }
 
         else if (mb_type == BSKIP)/* MB = BSKIP */
@@ -2293,17 +2297,18 @@
                             - bitstream_start_offset;
             /* Starting bitstream offset for residue */
 
+            pu1_byte += sizeof(mb_hdr_bskip_t);
         }
 
         else /* mbype is B_L0_16x16, B_L1_16x16 or B_Bi_16x16 */
         {
+            mb_hdr_b16x16_t *ps_mb_hdr_b16x16 = (mb_hdr_b16x16_t *)ps_ent_ctxt->pv_mb_header_data;
+
             WORD32 i4_mb_part_pred_mode = (mb_tpm >> 4);
             UWORD32 u4_mb_type = mb_type - B16x16 + B_L0_16x16
                             + i4_mb_part_pred_mode;
-            cbp = *pu1_byte++;
-            byte_count++;
-            mb_qp_delta = *pu1_byte++;
-            byte_count++;
+            cbp = ps_mb_hdr->u1_cbp;
+            mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta;
 
             /* Encoding mb_skip */
             ih264e_cabac_enc_mb_skip(0, ps_cabac_ctxt, MB_SKIP_FLAG_B_SLICE);
@@ -2338,11 +2343,9 @@
 
             ps_curr_ctxt->u1_mb_type = CAB_NON_BD16x16;
             {
-                WORD16 *pi2_mv_ptr = (WORD16 *) pu1_byte;
+                WORD16 *pi2_mv_ptr = (WORD16 *) ps_mb_hdr_b16x16->ai2_mv;
+
                 /* Get the pred modes */
-
-                byte_count += 4 * (1 + (i4_mb_part_pred_mode == PRED_BI));
-
                 ps_curr_ctxt->u1_mb_type = (ps_curr_ctxt->u1_mb_type
                                 | CAB_NON_BD16x16);
                 /* Encoding motion vector for B16x16 */
@@ -2364,6 +2367,8 @@
                             - bitstream_start_offset;
             /* Starting bitstream offset for residue */
             bitstream_start_offset = bitstream_end_offset;
+
+            pu1_byte += sizeof(mb_hdr_b16x16_t);
         }
 
         if (cbp > 0)
@@ -2388,7 +2393,7 @@
         }
         ps_curr_ctxt->u1_intrapred_chroma_mode = 0;
         ps_curr_ctxt->u1_cbp = cbp;
-        ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count;
+        ps_ent_ctxt->pv_mb_header_data = pu1_byte;
         return IH264E_SUCCESS;
     }
 }
diff --git a/encoder/ih264e_cabac_init.c b/encoder/ih264e_cabac_init.c
index 347842c..7407dcc 100644
--- a/encoder/ih264e_cabac_init.c
+++ b/encoder/ih264e_cabac_init.c
@@ -160,17 +160,13 @@
         /* 0th entry of mb_map_ctxt_inc will be always be containing default values */
         /* for CABAC context representing MB not available                       */
         mb_info_ctxt_t *ps_def_ctxt = ps_cabac_ctxt->ps_mb_map_ctxt_inc - 1;
-        UWORD32 *pu4_temp;
-        WORD8 i;
 
         ps_def_ctxt->u1_mb_type = CAB_SKIP;
         ps_def_ctxt->u1_cbp = 0x0f;
         ps_def_ctxt->u1_intrapred_chroma_mode = 0;
-        pu4_temp = (UWORD32 *)ps_def_ctxt->i1_ref_idx;
-        pu4_temp[0] = 0;
-        pu4_temp = (UWORD32 *)ps_def_ctxt->u1_mv;
-        for (i = 0; i < 4; i++, pu4_temp++)
-            (*pu4_temp) = 0;
+
+        memset(ps_def_ctxt->i1_ref_idx, 0, sizeof(ps_def_ctxt->i1_ref_idx));
+        memset(ps_def_ctxt->u1_mv, 0, sizeof(ps_def_ctxt->u1_mv));
         ps_cabac_ctxt->ps_def_ctxt_mb_info = ps_def_ctxt;
     }
 }
diff --git a/encoder/ih264e_cavlc.c b/encoder/ih264e_cavlc.c
index 7491480..ed34a43 100644
--- a/encoder/ih264e_cavlc.c
+++ b/encoder/ih264e_cavlc.c
@@ -959,6 +959,7 @@
 
     /* packed header data */
     UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data;
+    mb_hdr_common_t *ps_mb_hdr = (mb_hdr_common_t *)ps_ent_ctxt->pv_mb_header_data;
 
     /* mb header info */
     /*
@@ -986,9 +987,9 @@
     /********************************************************************/
 
     /* mb header info */
-    mb_tpm = *pu1_byte++;
-    cbp = *pu1_byte++;
-    mb_qp_delta = *pu1_byte++;
+    mb_tpm = ps_mb_hdr->u1_mb_type_mode;
+    cbp = ps_mb_hdr->u1_cbp;
+    mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta;
 
     /* mb type */
     mb_type = mb_tpm & 0xF;
@@ -1009,9 +1010,13 @@
 
         /* intra_chroma_pred_mode */
         PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode");
+
+        pu1_byte += sizeof(mb_hdr_i16x16_t);
     }
     else if (mb_type == I4x4)
     {
+        mb_hdr_i4x4_t *ps_mb_hdr_i4x4 = (mb_hdr_i4x4_t *)ps_ent_ctxt->pv_mb_header_data;
+
         /* mb sub blk modes */
         WORD32 intra_pred_mode_flag, rem_intra_mode;
         WORD32 byte;
@@ -1024,7 +1029,7 @@
         for (i = 0; i < 16; i += 2)
         {
             /* sub blk idx 1 */
-            byte = *pu1_byte++;
+            byte = ps_mb_hdr_i4x4->au1_sub_blk_modes[i >> 1];
 
             intra_pred_mode_flag = byte & 0x1;
 
@@ -1056,11 +1061,14 @@
 
         /* intra_chroma_pred_mode */
         PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode");
+
+        pu1_byte += sizeof(mb_hdr_i4x4_t);
     }
     else if (mb_type == I8x8)
     {
         /* transform 8x8 flag */
         UWORD32 u4_transform_size_8x8_flag = ps_ent_ctxt->i1_transform_8x8_mode_flag;
+        mb_hdr_i8x8_t *ps_mb_hdr_i8x8 = (mb_hdr_i8x8_t *)ps_ent_ctxt->pv_mb_header_data;
 
         /* mb sub blk modes */
         WORD32 intra_pred_mode_flag, rem_intra_mode;
@@ -1080,7 +1088,7 @@
         for (i = 0; i < 4; i++)
         {
             /* sub blk idx 1 */
-            byte = *pu1_byte++;
+            byte = ps_mb_hdr_i8x8->au1_sub_blk_modes[i >> 1];
 
             intra_pred_mode_flag = byte & 0x1;
 
@@ -1112,6 +1120,8 @@
 
         /* intra_chroma_pred_mode */
         PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode");
+
+        pu1_byte += sizeof(mb_hdr_i8x8_t);
     }
     else
     {
@@ -1181,6 +1191,7 @@
 
     /* packed header data */
     UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data;
+    mb_hdr_common_t *ps_mb_hdr = (mb_hdr_common_t *)ps_ent_ctxt->pv_mb_header_data;
 
     /* mb header info */
     /*
@@ -1211,7 +1222,7 @@
     /********************************************************************/
 
     /* mb header info */
-    mb_tpm = *pu1_byte++;
+    mb_tpm = ps_mb_hdr->u1_mb_type_mode;
 
     /* mb type */
     mb_type = mb_tpm & 0xF;
@@ -1227,6 +1238,7 @@
         (*ps_ent_ctxt->pi4_mb_skip_run)++;
 
         /* store the index of the next mb syntax layer */
+        pu1_byte += sizeof(mb_hdr_pskip_t);
         ps_ent_ctxt->pv_mb_header_data = pu1_byte;
 
         /* set nnz to zero */
@@ -1248,8 +1260,8 @@
     }
 
     /* remaining mb header info */
-    cbp = *pu1_byte++;
-    mb_qp_delta = *pu1_byte++;
+    cbp = ps_mb_hdr->u1_cbp;
+    mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta;
 
     /* mb skip run */
     PUT_BITS_UEV(ps_bitstream, *ps_ent_ctxt->pi4_mb_skip_run, error_status, "mb skip run");
@@ -1278,9 +1290,12 @@
 
         /* intra_chroma_pred_mode */
         PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode");
+        pu1_byte += sizeof(mb_hdr_i16x16_t);
     }
     else if (mb_type == I4x4)
     {
+        mb_hdr_i4x4_t *ps_mb_hdr_i4x4 = (mb_hdr_i4x4_t *)ps_ent_ctxt->pv_mb_header_data;
+
         /* mb sub blk modes */
         WORD32 intra_pred_mode_flag, rem_intra_mode;
         WORD32 byte;
@@ -1296,7 +1311,7 @@
         for (i = 0; i < 16; i += 2)
         {
             /* sub blk idx 1 */
-            byte = *pu1_byte++;
+            byte = ps_mb_hdr_i4x4->au1_sub_blk_modes[i >> 1];
 
             intra_pred_mode_flag = byte & 0x1;
 
@@ -1328,9 +1343,13 @@
 
         /* intra_chroma_pred_mode */
         PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode");
+
+        pu1_byte += sizeof(mb_hdr_i4x4_t);
     }
     else if (mb_type == I8x8)
     {
+        mb_hdr_i8x8_t *ps_mb_hdr_i8x8 = (mb_hdr_i8x8_t *)ps_ent_ctxt->pv_mb_header_data;
+
         /* transform 8x8 flag */
         UWORD32 u4_transform_size_8x8_flag = ps_ent_ctxt->i1_transform_8x8_mode_flag;
 
@@ -1355,7 +1374,7 @@
         for (i = 0; i < 4; i++)
         {
             /* sub blk idx 1 */
-            byte = *pu1_byte++;
+            byte = ps_mb_hdr_i8x8->au1_sub_blk_modes[i >> 1];
 
             intra_pred_mode_flag = byte & 0x1;
 
@@ -1387,14 +1406,18 @@
 
         /* intra_chroma_pred_mode */
         PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode");
+
+        pu1_byte += sizeof(mb_hdr_i8x8_t);
     }
     else
     {
+        mb_hdr_p16x16_t *ps_mb_hdr_p16x16 = (mb_hdr_p16x16_t *)ps_ent_ctxt->pv_mb_header_data;
+
         /* inter macro block partition cnt */
         const UWORD8 au1_part_cnt[] = { 1, 2, 2, 4 };
 
         /* mv ptr */
-        WORD16 *pi2_mv_ptr = (WORD16 *)pu1_byte;
+        WORD16 *pi2_mv_ptr = (WORD16 *)ps_mb_hdr_p16x16->ai2_mv;
 
         /* number of partitions for the current mb */
         UWORD32 u4_part_cnt = au1_part_cnt[mb_type - 3];
@@ -1410,7 +1433,8 @@
             PUT_BITS_SEV(ps_bitstream, *pi2_mv_ptr++, error_status, "mv y");
         }
 
-        pu1_byte = (UWORD8 *)pi2_mv_ptr;
+        pu1_byte += sizeof(mb_hdr_p16x16_t);
+
     }
 
     /* coded_block_pattern */
@@ -1479,6 +1503,7 @@
 
     /* packed header data */
     UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data;
+    mb_hdr_common_t *ps_mb_hdr = (mb_hdr_common_t *)ps_ent_ctxt->pv_mb_header_data;
 
     /* mb header info */
     /*
@@ -1508,7 +1533,7 @@
     /*                    BEGIN HEADER GENERATION                       */
     /********************************************************************/
 
-    mb_tpm = *pu1_byte++;
+    mb_tpm = ps_mb_hdr->u1_mb_type_mode;
 
     /* mb type */
     mb_type = mb_tpm & 0xF;
@@ -1524,6 +1549,7 @@
         (*ps_ent_ctxt->pi4_mb_skip_run)++;
 
         /* store the index of the next mb syntax layer */
+        pu1_byte += sizeof(mb_hdr_bskip_t);
         ps_ent_ctxt->pv_mb_header_data = pu1_byte;
 
         /* set nnz to zero */
@@ -1547,8 +1573,8 @@
 
 
     /* remaining mb header info */
-    cbp = *pu1_byte++;
-    mb_qp_delta = *pu1_byte++;
+    cbp = ps_mb_hdr->u1_cbp;
+    mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta;
 
     /* mb skip run */
     PUT_BITS_UEV(ps_bitstream, *ps_ent_ctxt->pi4_mb_skip_run, error_status, "mb skip run");
@@ -1577,9 +1603,13 @@
 
         /* intra_chroma_pred_mode */
         PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode");
+        pu1_byte += sizeof(mb_hdr_i16x16_t);
+
     }
     else if (mb_type == I4x4)
     {
+        mb_hdr_i4x4_t *ps_mb_hdr_i4x4 = (mb_hdr_i4x4_t *)ps_ent_ctxt->pv_mb_header_data;
+
         /* mb sub blk modes */
         WORD32 intra_pred_mode_flag, rem_intra_mode;
         WORD32 byte;
@@ -1595,7 +1625,7 @@
         for (i = 0; i < 16; i += 2)
         {
             /* sub blk idx 1 */
-            byte = *pu1_byte++;
+            byte = ps_mb_hdr_i4x4->au1_sub_blk_modes[i >> 1];
 
             intra_pred_mode_flag = byte & 0x1;
 
@@ -1627,9 +1657,13 @@
 
         /* intra_chroma_pred_mode */
         PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode");
+        pu1_byte += sizeof(mb_hdr_i4x4_t);
+
     }
     else if (mb_type == I8x8)
     {
+        mb_hdr_i8x8_t *ps_mb_hdr_i8x8 = (mb_hdr_i8x8_t *)ps_ent_ctxt->pv_mb_header_data;
+
         /* transform 8x8 flag */
         UWORD32 u4_transform_size_8x8_flag = ps_ent_ctxt->i1_transform_8x8_mode_flag;
 
@@ -1654,7 +1688,7 @@
         for (i = 0; i < 4; i++)
         {
             /* sub blk idx 1 */
-            byte = *pu1_byte++;
+            byte = ps_mb_hdr_i8x8->au1_sub_blk_modes[i >> 1];
 
             intra_pred_mode_flag = byte & 0x1;
 
@@ -1686,21 +1720,24 @@
 
         /* intra_chroma_pred_mode */
         PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode");
+        pu1_byte += sizeof(mb_hdr_i8x8_t);
+
     }
     else if(mb_type == BDIRECT)
     {
         is_inter = 1;
         /* write mb type */
         PUT_BITS_UEV(ps_bitstream, B_DIRECT_16x16, error_status, "mb type");
+        pu1_byte += sizeof(mb_hdr_bdirect_t);
+
     }
     else /* if mb_type == B16x16 */
     {
+        mb_hdr_b16x16_t *ps_mb_hdr_b16x16 = (mb_hdr_b16x16_t *)ps_ent_ctxt->pv_mb_header_data;
+
         /* inter macro block partition cnt for 16x16 16x8 8x16 8x8 */
         const UWORD8 au1_part_cnt[] = { 1, 2, 2, 4 };
 
-        /* mv ptr */
-        WORD16 *pi2_mvd_ptr = (WORD16 *)pu1_byte;
-
         /* number of partitions for the current mb */
         UWORD32 u4_part_cnt = au1_part_cnt[mb_type - B16x16];
 
@@ -1718,21 +1755,17 @@
         {
             if (i4_mb_part_pred_mode != PRED_L1)/* || PRED_BI */
             {
-                PUT_BITS_SEV(ps_bitstream, *pi2_mvd_ptr, error_status, "mv l0 x");
-                pi2_mvd_ptr++;
-                PUT_BITS_SEV(ps_bitstream, *pi2_mvd_ptr, error_status, "mv l0 y");
-                pi2_mvd_ptr++;
+                PUT_BITS_SEV(ps_bitstream, ps_mb_hdr_b16x16->ai2_mv[0][0], error_status, "mv l0 x");
+                PUT_BITS_SEV(ps_bitstream, ps_mb_hdr_b16x16->ai2_mv[0][1], error_status, "mv l0 y");
             }
             if (i4_mb_part_pred_mode != PRED_L0)/* || PRED_BI */
             {
-                PUT_BITS_SEV(ps_bitstream, *pi2_mvd_ptr, error_status, "mv l1 x");
-                pi2_mvd_ptr++;
-                PUT_BITS_SEV(ps_bitstream, *pi2_mvd_ptr, error_status, "mv l1 y");
-                pi2_mvd_ptr++;
+                PUT_BITS_SEV(ps_bitstream, ps_mb_hdr_b16x16->ai2_mv[1][0], error_status, "mv l1 x");
+                PUT_BITS_SEV(ps_bitstream, ps_mb_hdr_b16x16->ai2_mv[1][1], error_status, "mv l1 y");
             }
         }
 
-        pu1_byte = (UWORD8 *)pi2_mvd_ptr;
+        pu1_byte += sizeof(mb_hdr_b16x16_t);
     }
 
     /* coded_block_pattern */
diff --git a/encoder/ih264e_encode_header.c b/encoder/ih264e_encode_header.c
index 04bdc14..3626a63 100644
--- a/encoder/ih264e_encode_header.c
+++ b/encoder/ih264e_encode_header.c
@@ -1129,7 +1129,14 @@
     }
 
     /* direct_8x8_inference_flag */
-    ps_sps->i1_direct_8x8_inference_flag = 0;
+    if (ps_sps->u1_level_idc < IH264_LEVEL_30)
+    {
+        ps_sps->i1_direct_8x8_inference_flag = 0;
+    }
+    else
+    {
+        ps_sps->i1_direct_8x8_inference_flag = 1;
+    }
 
     /* cropping params */
     /*NOTE : Cropping values depend on the chroma format
diff --git a/encoder/ih264e_process.c b/encoder/ih264e_process.c
index 796c983..5fb0b88 100644
--- a/encoder/ih264e_process.c
+++ b/encoder/ih264e_process.c
@@ -652,18 +652,19 @@
     {
         /* pointer to mb header storage space */
         UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data;
+        mb_hdr_i4x4_t *ps_mb_hdr = (mb_hdr_i4x4_t *)ps_proc->pv_mb_header_data;
 
         /* temp var */
         WORD32 i4, byte;
 
         /* mb type plus mode */
-        *pu1_ptr++ = (ps_proc->u1_c_i8_mode << 6) + u4_mb_type;
+        ps_mb_hdr->common.u1_mb_type_mode = (ps_proc->u1_c_i8_mode << 6) + u4_mb_type;
 
         /* cbp */
-        *pu1_ptr++ = ps_proc->u4_cbp;
+        ps_mb_hdr->common.u1_cbp = ps_proc->u4_cbp;
 
         /* mb qp delta */
-        *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev;
+        ps_mb_hdr->common.u1_mb_qp_delta = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev;
 
         /* sub mb modes */
         for (i4 = 0; i4 < 16; i4 ++)
@@ -710,63 +711,66 @@
                 }
             }
 
-            *pu1_ptr++ = byte;
+            ps_mb_hdr->au1_sub_blk_modes[i4 >> 1] =  byte;
         }
 
         /* end of mb layer */
+        pu1_ptr += sizeof(mb_hdr_i4x4_t);
         ps_proc->pv_mb_header_data = pu1_ptr;
     }
     else if (u4_mb_type == I16x16)
     {
         /* pointer to mb header storage space */
         UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data;
+        mb_hdr_i16x16_t *ps_mb_hdr = (mb_hdr_i16x16_t *)ps_proc->pv_mb_header_data;
 
         /* mb type plus mode */
-        *pu1_ptr++ = (ps_proc->u1_c_i8_mode << 6) + (ps_proc->u1_l_i16_mode << 4) + u4_mb_type;
+        ps_mb_hdr->common.u1_mb_type_mode = (ps_proc->u1_c_i8_mode << 6) + (ps_proc->u1_l_i16_mode << 4) + u4_mb_type;
 
         /* cbp */
-        *pu1_ptr++ = ps_proc->u4_cbp;
+        ps_mb_hdr->common.u1_cbp = ps_proc->u4_cbp;
 
         /* mb qp delta */
-        *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev;
+        ps_mb_hdr->common.u1_mb_qp_delta = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev;
 
         /* end of mb layer */
+        pu1_ptr += sizeof(mb_hdr_i16x16_t);
         ps_proc->pv_mb_header_data = pu1_ptr;
     }
     else if (u4_mb_type == P16x16)
     {
         /* pointer to mb header storage space */
         UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data;
+        mb_hdr_p16x16_t *ps_mb_hdr = (mb_hdr_p16x16_t *)ps_proc->pv_mb_header_data;
 
-        WORD16 *i2_mv_ptr;
-
-        /* mb type plus mode */
-        *pu1_ptr++ = u4_mb_type;
+        /* mb type */
+        ps_mb_hdr->common.u1_mb_type_mode = u4_mb_type;
 
         /* cbp */
-        *pu1_ptr++ = ps_proc->u4_cbp;
+        ps_mb_hdr->common.u1_cbp = ps_proc->u4_cbp;
 
         /* mb qp delta */
-        *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev;
+        ps_mb_hdr->common.u1_mb_qp_delta = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev;
 
-        i2_mv_ptr = (WORD16 *)pu1_ptr;
+        ps_mb_hdr->ai2_mv[0] = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvx - ps_proc->ps_pred_mv[0].s_mv.i2_mvx;
 
-        *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvx - ps_proc->ps_pred_mv[0].s_mv.i2_mvx;
-
-        *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvy - ps_proc->ps_pred_mv[0].s_mv.i2_mvy;
+        ps_mb_hdr->ai2_mv[1] = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvy - ps_proc->ps_pred_mv[0].s_mv.i2_mvy;
 
         /* end of mb layer */
-        ps_proc->pv_mb_header_data = i2_mv_ptr;
+        pu1_ptr += sizeof(mb_hdr_p16x16_t);
+        ps_proc->pv_mb_header_data = pu1_ptr;
     }
     else if (u4_mb_type == PSKIP)
     {
         /* pointer to mb header storage space */
         UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data;
+        mb_hdr_pskip_t *ps_mb_hdr = (mb_hdr_pskip_t *)ps_proc->pv_mb_header_data;
 
-        /* mb type plus mode */
-        *pu1_ptr++ = u4_mb_type;
+        /* mb type */
+        ps_mb_hdr->common.u1_mb_type_mode = u4_mb_type;
 
         /* end of mb layer */
+        pu1_ptr += sizeof(mb_hdr_pskip_t);
         ps_proc->pv_mb_header_data = pu1_ptr;
     }
     else if(u4_mb_type == B16x16)
@@ -774,58 +778,59 @@
 
         /* pointer to mb header storage space */
         UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data;
-
-        WORD16 *i2_mv_ptr;
+        mb_hdr_b16x16_t *ps_mb_hdr = (mb_hdr_b16x16_t *)ps_proc->pv_mb_header_data;
 
         UWORD32 u4_pred_mode = ps_proc->ps_pu->b2_pred_mode;
 
         /* mb type plus mode */
-        *pu1_ptr++ = (u4_pred_mode << 4) + u4_mb_type;
+        ps_mb_hdr->common.u1_mb_type_mode = (u4_pred_mode << 4) + u4_mb_type;
 
         /* cbp */
-        *pu1_ptr++ = ps_proc->u4_cbp;
+        ps_mb_hdr->common.u1_cbp = ps_proc->u4_cbp;
 
         /* mb qp delta */
-        *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev;
+        ps_mb_hdr->common.u1_mb_qp_delta = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev;
 
         /* l0 & l1 me data */
-        i2_mv_ptr = (WORD16 *)pu1_ptr;
-
         if (u4_pred_mode != PRED_L1)
         {
-            *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvx
+            ps_mb_hdr->ai2_mv[0][0] = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvx
                             - ps_proc->ps_pred_mv[0].s_mv.i2_mvx;
 
-            *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvy
+            ps_mb_hdr->ai2_mv[0][1] = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvy
                             - ps_proc->ps_pred_mv[0].s_mv.i2_mvy;
         }
         if (u4_pred_mode != PRED_L0)
         {
-            *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[1].s_mv.i2_mvx
+            ps_mb_hdr->ai2_mv[1][0] = ps_proc->ps_pu->s_me_info[1].s_mv.i2_mvx
                             - ps_proc->ps_pred_mv[1].s_mv.i2_mvx;
 
-            *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[1].s_mv.i2_mvy
+            ps_mb_hdr->ai2_mv[1][1] = ps_proc->ps_pu->s_me_info[1].s_mv.i2_mvy
                             - ps_proc->ps_pred_mv[1].s_mv.i2_mvy;
         }
 
         /* end of mb layer */
-        ps_proc->pv_mb_header_data = i2_mv_ptr;
+        pu1_ptr += sizeof(mb_hdr_b16x16_t);
+        ps_proc->pv_mb_header_data = pu1_ptr;
 
     }
     else if(u4_mb_type == BDIRECT)
     {
         /* pointer to mb header storage space */
         UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data;
+        mb_hdr_bdirect_t *ps_mb_hdr = (mb_hdr_bdirect_t *)ps_proc->pv_mb_header_data;
 
         /* mb type plus mode */
-        *pu1_ptr++ = u4_mb_type;
+        ps_mb_hdr->common.u1_mb_type_mode = u4_mb_type;
 
         /* cbp */
-        *pu1_ptr++ = ps_proc->u4_cbp;
+        ps_mb_hdr->common.u1_cbp = ps_proc->u4_cbp;
 
         /* mb qp delta */
-        *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev;
+        ps_mb_hdr->common.u1_mb_qp_delta = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev;
 
+        /* end of mb layer */
+        pu1_ptr += sizeof(mb_hdr_bdirect_t);
         ps_proc->pv_mb_header_data = pu1_ptr;
 
     }
@@ -835,11 +840,13 @@
 
         /* pointer to mb header storage space */
         UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data;
+        mb_hdr_bskip_t *ps_mb_hdr = (mb_hdr_bskip_t *)ps_proc->pv_mb_header_data;
 
         /* mb type plus mode */
-        *pu1_ptr++ = (u4_pred_mode << 4) + u4_mb_type;
+        ps_mb_hdr->common.u1_mb_type_mode = (u4_pred_mode << 4) + u4_mb_type;
 
         /* end of mb layer */
+        pu1_ptr += sizeof(mb_hdr_bskip_t);
         ps_proc->pv_mb_header_data = pu1_ptr;
     }
 
diff --git a/encoder/ih264e_structs.h b/encoder/ih264e_structs.h
index 6cbce7c..125db84 100644
--- a/encoder/ih264e_structs.h
+++ b/encoder/ih264e_structs.h
@@ -1151,6 +1151,184 @@
 
 /**
 ******************************************************************************
+*  @brief     Leading elements common to every mb_hdr structure defined
+* below; each of them embeds this as its first member
+******************************************************************************
+*/
+
+typedef struct
+{
+    /**
+     * mb type and mode
+     */
+    UWORD8 u1_mb_type_mode;
+
+    /**
+     * CBP
+     */
+    UWORD8 u1_cbp;
+
+    /**
+     * MB qp delta
+     */
+    UWORD8 u1_mb_qp_delta;
+
+    /**
+     * Element to align structure to 2 byte boundary
+     */
+    UWORD8 u1_pad;
+}mb_hdr_common_t;
+
+/**
+******************************************************************************
+*  @brief      macro block info for I4x4 MB
+******************************************************************************
+*/
+typedef struct
+{
+    /**
+     * Common MB header params
+     */
+    mb_hdr_common_t common;
+
+    /**
+     * Sub block modes, 2 modes per byte
+     */
+    UWORD8 au1_sub_blk_modes[8];
+}mb_hdr_i4x4_t;
+
+/**
+******************************************************************************
+*  @brief      macro block info for I8x8 MB
+******************************************************************************
+*/
+typedef struct
+{
+    /**
+     * Common MB header params
+     */
+    mb_hdr_common_t common;
+
+
+    /**
+     * Sub block modes, 2 modes per byte
+     */
+    UWORD8 au1_sub_blk_modes[2];
+}mb_hdr_i8x8_t;
+
+/**
+******************************************************************************
+*  @brief      macro block info for I16x16 MB
+******************************************************************************
+*/
+typedef struct
+{
+    /**
+     * Common MB header params
+     */
+    mb_hdr_common_t common;
+
+}mb_hdr_i16x16_t;
+
+/**
+******************************************************************************
+*  @brief      macro block info for P16x16 MB
+******************************************************************************
+*/
+typedef struct
+{
+    /**
+     * Common MB header params
+     */
+    mb_hdr_common_t common;
+
+    /**
+     * MV
+     */
+    WORD16 ai2_mv[2];
+}mb_hdr_p16x16_t;
+
+/**
+******************************************************************************
+*  @brief      macro block info for PSKIP MB
+******************************************************************************
+*/
+typedef struct
+{
+    /**
+     * Common MB header params
+     */
+    mb_hdr_common_t common;
+
+}mb_hdr_pskip_t;
+
+/**
+******************************************************************************
+*  @brief      macro block info for B16x16 MB
+******************************************************************************
+*/
+typedef struct
+{
+    /**
+     * Common MB header params
+     */
+    mb_hdr_common_t common;
+
+
+    /**
+     * MV
+     */
+    WORD16 ai2_mv[2][2];
+}mb_hdr_b16x16_t;
+
+/**
+******************************************************************************
+*  @brief      macro block info for BDIRECT MB
+******************************************************************************
+*/
+typedef struct
+{
+    /**
+     * Common MB header params
+     */
+    mb_hdr_common_t common;
+
+}mb_hdr_bdirect_t;
+
+/**
+******************************************************************************
+*  @brief      macro block info for BSKIP MB
+******************************************************************************
+*/
+typedef struct
+{
+    /**
+     * Common MB header params
+     */
+    mb_hdr_common_t common;
+
+}mb_hdr_bskip_t;
+
+/**
+******************************************************************************
+*  @brief      Union of mb_hdr structures for size calculation
+*  and to access first few common elements
+******************************************************************************
+*/
+
+typedef union
+{
+    mb_hdr_i4x4_t       mb_hdr_i4x4;
+    mb_hdr_i8x8_t       mb_hdr_i8x8;
+    mb_hdr_i16x16_t     mb_hdr_i16x16;
+    mb_hdr_p16x16_t     mb_hdr_p16x16;
+    mb_hdr_pskip_t      mb_hdr_pskip;
+    mb_hdr_b16x16_t     mb_hdr_b16x16;
+    mb_hdr_bdirect_t    mb_hdr_bdirect;
+    mb_hdr_bskip_t      mb_hdr_bskip;
+}mb_hdr_t;
+/**
+******************************************************************************
 *  @brief      structure presenting the neighbor availability of a mb
 *  or subblk or any other partition
 ******************************************************************************
diff --git a/test/decoder.mk b/test/decoder.mk
index 1a49a92..0dda948 100644
--- a/test/decoder.mk
+++ b/test/decoder.mk
@@ -9,5 +9,5 @@
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/../decoder $(LOCAL_PATH)/../common $(LOCAL_PATH)/decoder/
 LOCAL_SRC_FILES := decoder/main.c
 LOCAL_STATIC_LIBRARIES := libavcdec
-
+LOCAL_SHARED_LIBRARIES := liblog
 include $(BUILD_EXECUTABLE)
diff --git a/test/decoder/dec.cfg b/test/decoder/dec.cfg
new file mode 100644
index 0000000..f452ea1
--- /dev/null
+++ b/test/decoder/dec.cfg
@@ -0,0 +1,12 @@
+--input input.h264
+--save_output 0
+--num_frames -1
+--output out.yuv
+--chroma_format YUV_420P
+--share_display_buf 0
+--num_cores 3
+--loopback 0
+--display 0
+--fps 59.94
+--arch ARM_A9Q
+--soc GENERIC
diff --git a/test/encoder/enc.cfg b/test/encoder/enc.cfg
new file mode 100644
index 0000000..ba62199
--- /dev/null
+++ b/test/encoder/enc.cfg
@@ -0,0 +1,47 @@
+--input input_qvga.yuv
+--output output.264
+--recon recon.yuv
+--chksum chksum.md5
+--chksum_enable 0
+--recon_enable 0
+--input_chroma_format YUV_420P
+--recon_chroma_format YUV_420P
+--qp_i 24
+--qp_p 27
+--qp_b 29
+--qp_i_min 4
+--qp_i_max 49
+--qp_p_min 4
+--qp_p_max 49
+--qp_b_min 4
+--qp_b_max 49
+--max_wd 1920
+--max_ht 1080
+--psnr 0
+--slice 0
+--slice_param 0
+--num_frames -1
+--search_range_x 16
+--search_range_y 16
+--width 320
+--height 240
+--src_framerate 30
+--tgt_framerate 30
+--num_cores 4
+--rc 2
+--bitrate 256000
+--vbv_delay 1000
+--disable_deblock_level 0
+--intra_4x4_enable 1
+--i_interval 1000
+--me_speed 100
+--hpel 1
+--fast_sad 0
+--speed NORMAL
+--max_level 41
+--idr_interval 1000
+--entropy 0
+--bframes 0
+--adaptive_intra_refresh 0
+--air_refresh_period 30
+