A few bug fixes in loop filter

Removed a few unused functions
Fixed an issue in 420P output in shared mode
A few bug fixes in handling loop filter corner cases in multi-tile multi-slice clips
Renamed a few elements in mv_buf_t for consistency

Bug: 22860270

Change-Id: I8d179b9ad3c7ee3ad27972ee02ea2658fd2c3c25
diff --git a/common/arm/ihevc_inter_pred_chroma_horz.s b/common/arm/ihevc_inter_pred_chroma_horz.s
index fbd1be1..4781d3e 100644
--- a/common/arm/ihevc_inter_pred_chroma_horz.s
+++ b/common/arm/ihevc_inter_pred_chroma_horz.s
@@ -199,7 +199,8 @@
 @   pld         [r12, r2, lsl #1]
 @   pld         [r4, r2, lsl #1]
 
-
+    pld         [r12, r2, lsl #2]
+    pld         [r4, r2, lsl #2]
 
     subs        r10,r10,#16
 
@@ -212,7 +213,6 @@
 
 
 
-    pld         [r12, r2, lsl #2]
     vqrshrun.s16 d30,q15,#6
 
     vld1.u32    {q0},[r12],r11              @vector load pu1_src
@@ -232,7 +232,6 @@
     vld1.u32    {q3},[r12],r9               @vector load pu1_src
     vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
 
-    pld         [r4, r2, lsl #2]
     vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
 
     vst1.16     {q15}, [r1],r3
diff --git a/common/arm/ihevc_inter_pred_filters_luma_horz.s b/common/arm/ihevc_inter_pred_filters_luma_horz.s
index ee98923..215f8fd 100644
--- a/common/arm/ihevc_inter_pred_filters_luma_horz.s
+++ b/common/arm/ihevc_inter_pred_filters_luma_horz.s
@@ -342,6 +342,9 @@
     vld1.u32    {q9},[r4],r11
     vmlal.u8    q5,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
 
+    pld         [r12, r2, lsl #2]
+    pld         [r4, r2, lsl #2]
+
     add         r4,#8
     vmlsl.u8    q5,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
 
@@ -373,10 +376,8 @@
 @   cmp         r7, r0
     vmlsl.u8    q11,d5,d26
 
-    pld         [r12, r2, lsl #2]
     vmlal.u8    q11,d13,d28
 
-    pld         [r4, r2, lsl #2]
     vmlal.u8    q11,d17,d30
 
 @   mov         r0, r7
diff --git a/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s b/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
index f2431e1..fb75e96 100644
--- a/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
+++ b/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
@@ -41,7 +41,7 @@
 @*
 @* @brief
 @*  intraprediction for mode 27 to 33  (positive angle, vertical mode ) with
-@*.extern  neighboring samples location pointed by 'pu1_ref' to the  tu
+@* neighboring samples location pointed by 'pu1_ref' to the  tu
 @* block location pointed by 'pu1_dst'
 @*
 @* @par description:
diff --git a/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s b/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
index 93495f8..ec38786 100644
--- a/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
+++ b/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
@@ -98,7 +98,7 @@
 .extern gai4_ihevc_ang_table
 .extern gai4_ihevc_inv_ang_table
 .extern col_for_intra_luma
-.extern idx_11_17
+.extern idx_neg_idx_11_17
 
 gai4_ihevc_ang_table_addr:
 .long gai4_ihevc_ang_table - ulbl1 - 8
diff --git a/common/arm/ihevc_platform_macros.h b/common/arm/ihevc_platform_macros.h
index 72ef0c3..fc08ba0 100644
--- a/common/arm/ihevc_platform_macros.h
+++ b/common/arm/ihevc_platform_macros.h
@@ -123,6 +123,7 @@
     else
         return 32;
 }
+
 static INLINE UWORD32 CTZ(UWORD32 u4_word)
 {
     if(0 == u4_word)
@@ -136,6 +137,77 @@
 }
 
 
+/**
+******************************************************************************
+ *  @brief  returns position of the most significant set bit for 32-bit input (-1 if input is 0)
+******************************************************************************
+ */
+#define GET_POS_MSB_32(r,word)                         \
+{                                                      \
+    if(word)                                           \
+    {                                                  \
+        r = 31 - __builtin_clz(word);                  \
+    }                                                  \
+    else                                               \
+    {                                                  \
+        r = -1;                                        \
+    }                                                  \
+}
+
+/**
+******************************************************************************
+ *  @brief  returns position of the most significant set bit for 64-bit input (-1 if input is 0)
+******************************************************************************
+ */
+#define GET_POS_MSB_64(r,word)                         \
+{                                                      \
+    if(word)                                           \
+    {                                                  \
+        r = 63 - __builtin_clzll(word);                \
+    }                                                  \
+    else                                               \
+    {                                                  \
+        r = -1;                                        \
+    }                                                  \
+}
+
+
+/**
+******************************************************************************
+ *  @brief  returns number of bits required to represent input word (max 32 bits; 1 if input is 0)
+******************************************************************************
+ */
+#define GETRANGE(r,word)                               \
+{                                                      \
+    if(word)                                           \
+    {                                                  \
+        r = 32 - __builtin_clz(word);                  \
+    }                                                  \
+    else                                               \
+    {                                                  \
+        r = 1;                                         \
+    }                                                  \
+}
+
+#if 0 /*  Equivalent C code for GETRANGE */
+#define GETRANGE(r,word)    \
+{                           \
+    UWORD32 temp;           \
+    r = 0;                  \
+    temp = (UWORD32)word;   \
+    if(0 == word)           \
+        r = 1;              \
+    else                    \
+    {                       \
+        while(temp)         \
+        {                   \
+            temp >>= 1;     \
+            r++;            \
+        }                   \
+    }\
+}
+#endif
+
 
 
 #define NOP(nop_cnt)    {UWORD32 nop_i; for (nop_i = 0; nop_i < nop_cnt; nop_i++);}
diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
index 3230136..16d3bde 100644
--- a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
+++ b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
@@ -41,7 +41,7 @@
 //*
 //* @brief
 //*  intraprediction for mode 27 to 33  (positive angle, vertical mode ) with
-//*.extern  neighboring samples location pointed by 'pu1_ref' to the  tu
+//* neighboring samples location pointed by 'pu1_ref' to the  tu
 //* block location pointed by 'pu1_dst'
 //*
 //* @par description:
diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
index dcc0fc7..c68ed70 100644
--- a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
+++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
@@ -98,7 +98,7 @@
 .extern gai4_ihevc_ang_table
 .extern gai4_ihevc_inv_ang_table
 .extern col_for_intra_luma
-.extern idx_11_17
+.extern idx_neg_idx_11_17
 
 .type ihevc_intra_pred_luma_mode_11_to_17_av8, %function
 
diff --git a/common/ihevc_common_tables.c b/common/ihevc_common_tables.c
index 7927497..7363cbb 100644
--- a/common/ihevc_common_tables.c
+++ b/common/ihevc_common_tables.c
@@ -453,15 +453,15 @@
     45, 46, 47, 48, 49, 50, 51
 };
 
-// FOR HBD branch encoder ( 8 and 10 bit)
-const WORD8 gai1_ihevc_chroma_qp_scale[70] =  //EXTENDED for 10 bit
+// FOR HBD branch encoder ( 8, 10 and 12 bit)
+const WORD8 gai1_ihevc_chroma_qp_scale[82]=  //EXTENDED for 12 bit
 {
-
-    -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1,
-    0,   1,   2,   3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
-    17,  18,  19,  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32,
-    33,  33,  34,  34, 35, 35, 36, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44,
-    45,  46,  47,  48, 49, 50, 51
+  -24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,
+  -12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,
+  17,18,19,20,21,22,23,24,25,26,27,28,29,29,30,31,32,
+  33,33,34,34,35,35,36,36,37,37,38,39,40,41,42,43,44,
+  45,46,47,48,49,50,51
 };
 
 
diff --git a/common/ihevc_deblk.h b/common/ihevc_deblk.h
index cd4c8c8..6c13c8a 100644
--- a/common/ihevc_deblk.h
+++ b/common/ihevc_deblk.h
@@ -134,6 +134,8 @@
 ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz;
 ihevc_deblk_chroma_vert_ft ihevc_deblk_chroma_vert;
 ihevc_deblk_chroma_horz_ft ihevc_deblk_chroma_horz;
+ihevc_deblk_chroma_vert_ft ihevc_deblk_422chroma_vert;
+ihevc_deblk_chroma_horz_ft ihevc_deblk_422chroma_horz;
 
 ihevc_deblk_luma_vert_ft ihevc_deblk_luma_vert_a9q;
 ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz_a9q;
@@ -154,16 +156,22 @@
 ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz_ssse3;
 ihevc_deblk_chroma_vert_ft ihevc_deblk_chroma_vert_ssse3;
 ihevc_deblk_chroma_horz_ft ihevc_deblk_chroma_horz_ssse3;
+ihevc_deblk_chroma_vert_ft ihevc_deblk_422chroma_vert_ssse3;
+ihevc_deblk_chroma_horz_ft ihevc_deblk_422chroma_horz_ssse3;
 
 ihevc_hbd_deblk_luma_vert_ft ihevc_hbd_deblk_luma_vert;
 ihevc_hbd_deblk_luma_horz_ft ihevc_hbd_deblk_luma_horz;
 ihevc_hbd_deblk_chroma_vert_ft ihevc_hbd_deblk_chroma_vert;
 ihevc_hbd_deblk_chroma_horz_ft ihevc_hbd_deblk_chroma_horz;
+ihevc_hbd_deblk_chroma_vert_ft ihevc_hbd_deblk_422chroma_vert;
+ihevc_hbd_deblk_chroma_horz_ft ihevc_hbd_deblk_422chroma_horz;
 
 ihevc_hbd_deblk_luma_vert_ft ihevc_hbd_deblk_luma_vert_sse42;
 ihevc_hbd_deblk_luma_horz_ft ihevc_hbd_deblk_luma_horz_sse42;
 ihevc_hbd_deblk_chroma_vert_ft ihevc_hbd_deblk_chroma_vert_sse42;
 ihevc_hbd_deblk_chroma_horz_ft ihevc_hbd_deblk_chroma_horz_sse42;
+ihevc_hbd_deblk_chroma_vert_ft ihevc_hbd_deblk_422chroma_vert_sse42;
+ihevc_hbd_deblk_chroma_horz_ft ihevc_hbd_deblk_422chroma_horz_sse42;
 
 ihevc_deblk_luma_vert_ft ihevc_deblk_luma_vert_av8;
 ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz_av8;
diff --git a/common/ihevc_deblk_edge_filter.c b/common/ihevc_deblk_edge_filter.c
index 8b6e6ea..adeca78 100644
--- a/common/ihevc_deblk_edge_filter.c
+++ b/common/ihevc_deblk_edge_filter.c
@@ -31,10 +31,6 @@
 *   - ihevc_deblk_luma_horz()
 *   - ihevc_deblk_chroma_vert()
 *   - ihevc_deblk_chroma_horz()
-*   - ihevc_hbd_deblk_luma_vert()
-*   - ihevc_hbd_deblk_luma_horz()
-*   - ihevc_hbd_deblk_chroma_vert()
-*   - ihevc_hbd_deblk_chroma_horz()
 * @remarks
 *  None
 *
@@ -288,244 +284,6 @@
 
 }
 
-
-/**
-*******************************************************************************
-*
-* @brief
-*       Decision process and filtering for the luma block vertical edge for high bit depth.
-*
-* @par Description:
-*     The decision process for the luma block vertical edge is  carried out and
-*     an appropriate filter is applied. The  boundary filter strength, bs should
-*     be greater than 0.  The pcm flags and the transquant bypass flags should
-*     be  taken care of by the calling function.
-*
-* @param[in] pu2_src
-*  Pointer to the src sample q(0,0)
-*
-* @param[in] src_strd
-*  Source stride
-*
-* @param[in] bs
-*  Boundary filter strength of q(0,0)
-*
-* @param[in] quant_param_p
-*  quantization parameter of p block
-*
-* @param[in] quant_param_q
-*  quantization parameter of p block
-*
-* @param[in] beta_offset_div2
-*
-*
-* @param[in] tc_offset_div2
-*
-*
-* @param[in] filter_flag_p
-*  flag whether to filter the p block
-*
-* @param[in] filter_flag_q
-*  flag whether to filter the q block
-*
-* @returns
-*
-* @remarks
-*  None
-*
-*******************************************************************************
-*/
-
-void ihevc_hbd_deblk_luma_vert(UWORD16 *pu2_src,
-                               WORD32 src_strd,
-                               WORD32 bs,
-                               WORD32 quant_param_p,
-                               WORD32 quant_param_q,
-                               WORD32 beta_offset_div2,
-                               WORD32 tc_offset_div2,
-                               WORD32 filter_flag_p,
-                               WORD32 filter_flag_q,
-                               UWORD8 bit_depth)
-{
-    WORD32 qp_luma, beta_indx, tc_indx;
-    WORD32 beta, tc;
-    WORD32 dp0, dp3, dq0, dq3, d0, d3, dp, dq, d;
-    WORD32 d_sam0, d_sam3;
-    WORD32 de, dep, deq;
-    WORD32 row;
-    WORD32 tmp_p0, tmp_p1, tmp_p2, tmp_q0, tmp_q1, tmp_q2;
-    WORD32 delta, delta_p, delta_q;
-
-    ASSERT((bs > 0) && (bs <= 3));
-    ASSERT(filter_flag_p || filter_flag_q);
-
-    qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
-    beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
-
-    /* BS based on implementation can take value 3 if it is intra/inter egde          */
-    /* based on BS, tc index is calcuated by adding 2 * ( bs - 1) to QP and tc_offset */
-    /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2)          */
-    /* the above desired functionallity is achieved by doing (2*(bs>>1))              */
-
-    tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);
-
-    beta = gai4_ihevc_beta_table[beta_indx] * (1 << (bit_depth - 8));
-    tc = gai4_ihevc_tc_table[tc_indx] * (1 << (bit_depth - 8));
-    if(0 == tc)
-    {
-        return;
-    }
-
-    dq0 = ABS(pu2_src[2] - 2 * pu2_src[1] + pu2_src[0]);
-    dq3 = ABS(pu2_src[3 * src_strd + 2] - 2 * pu2_src[3 * src_strd + 1]
-                    + pu2_src[3 * src_strd + 0]);
-    dp0 = ABS(pu2_src[-3] - 2 * pu2_src[-2] + pu2_src[-1]);
-    dp3 = ABS(pu2_src[3 * src_strd - 3] - 2 * pu2_src[3 * src_strd - 2]
-                    + pu2_src[3 * src_strd - 1]);
-
-    d0 = dp0 + dq0;
-    d3 = dp3 + dq3;
-
-    dp = dp0 + dp3;
-    dq = dq0 + dq3;
-
-    d = d0 + d3;
-
-    de = 0;
-    dep = 0;
-    deq = 0;
-
-    if(d < beta)
-    {
-        d_sam0 = 0;
-        if((2 * d0 < (beta >> 2))
-                        && (ABS(pu2_src[3] - pu2_src[0]) + ABS(pu2_src[-1] - pu2_src[-4])
-                                        < (beta >> 3))
-                        && ABS(pu2_src[0] - pu2_src[-1]) < ((5 * tc + 1) >> 1))
-        {
-            d_sam0 = 1;
-        }
-
-        pu2_src += 3 * src_strd;
-        d_sam3 = 0;
-        if((2 * d3 < (beta >> 2))
-                        && (ABS(pu2_src[3] - pu2_src[0]) + ABS(pu2_src[-1] - pu2_src[-4])
-                                        < (beta >> 3))
-                        && ABS(pu2_src[0] - pu2_src[-1]) < ((5 * tc + 1) >> 1))
-        {
-            d_sam3 = 1;
-        }
-        pu2_src -= 3 * src_strd;
-
-        de = (d_sam0 == 1 && d_sam3 == 1) ? 2 : 1;
-        dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
-        deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
-        if(tc <= 1)
-        {
-            dep = 0;
-            deq = 0;
-        }
-    }
-
-    if(de != 0)
-    {
-        for(row = 0; row < 4; row++)
-        {
-            tmp_p0 = pu2_src[-1];
-            tmp_p1 = pu2_src[-2];
-            tmp_p2 = pu2_src[-3];
-
-            tmp_q0 = pu2_src[0];
-            tmp_q1 = pu2_src[1];
-            tmp_q2 = pu2_src[2];
-
-            if(de == 2)
-            {
-                tmp_q0 = CLIP3((pu2_src[2] + 2 * pu2_src[1] +
-                                2 * pu2_src[0] + 2 * pu2_src[-1] +
-                                pu2_src[-2] + 4) >> 3,
-                                pu2_src[0] - 2 * tc,
-                                pu2_src[0] + 2 * tc);
-
-                tmp_q1 = CLIP3((pu2_src[2] + pu2_src[1] + pu2_src[0] +
-                                pu2_src[-1] + 2) >> 2,
-                                pu2_src[1] - 2 * tc,
-                                pu2_src[1] + 2 * tc);
-
-                tmp_q2 = CLIP3((2 * pu2_src[3] + 3 * pu2_src[2] +
-                                pu2_src[1] + pu2_src[0] +
-                                pu2_src[-1] + 4) >> 3,
-                                pu2_src[2] - 2 * tc,
-                                pu2_src[2] + 2 * tc);
-
-                tmp_p0 = CLIP3((pu2_src[1] + 2 * pu2_src[0] +
-                                2 * pu2_src[-1] + 2 * pu2_src[-2] +
-                                pu2_src[-3] + 4) >> 3,
-                                pu2_src[-1] - 2 * tc,
-                                pu2_src[-1] + 2 * tc);
-
-                tmp_p1 = CLIP3((pu2_src[0] + pu2_src[-1] +
-                                pu2_src[-2] + pu2_src[-3] + 2) >> 2,
-                                pu2_src[-2] - 2 * tc,
-                                pu2_src[-2] + 2 * tc);
-
-                tmp_p2 = CLIP3((pu2_src[0] + pu2_src[-1] +
-                                pu2_src[-2] + 3 * pu2_src[-3] +
-                                2 * pu2_src[-4] + 4) >> 3,
-                                pu2_src[-3] - 2 * tc,
-                                pu2_src[-3] + 2 * tc);
-            }
-            else
-            {
-                delta = (9 * (pu2_src[0] - pu2_src[-1]) -
-                                3 * (pu2_src[1] - pu2_src[-2]) + 8) >> 4;
-                if(ABS(delta) < 10 * tc)
-                {
-                    delta = CLIP3(delta, -tc, tc);
-
-                    tmp_p0 = CLIP3(pu2_src[-1] + delta, 0, ((1 << bit_depth) - 1));
-                    tmp_q0 = CLIP3(pu2_src[0] - delta, 0, ((1 << bit_depth) - 1));
-                    if(dep == 1)
-                    {
-                        delta_p = CLIP3((((pu2_src[-3] + pu2_src[-1] + 1) >> 1)
-                                        - pu2_src[-2] + delta) >> 1,
-                                        -(tc >> 1),
-                                        (tc >> 1));
-                        tmp_p1 = CLIP3(pu2_src[-2] + delta_p, 0, ((1 << bit_depth) - 1));
-                    }
-
-                    if(deq == 1)
-                    {
-                        delta_q = CLIP3((((pu2_src[2] + pu2_src[0] + 1) >> 1)
-                                        - pu2_src[1] - delta) >> 1,
-                                        -(tc >> 1),
-                                        (tc >> 1));
-                        tmp_q1 = CLIP3(pu2_src[1] + delta_q, 0, ((1 << bit_depth) - 1));
-                    }
-                }
-            }
-
-            if(filter_flag_p != 0)
-            {
-                pu2_src[-3] = tmp_p2;
-                pu2_src[-2] = tmp_p1;
-                pu2_src[-1] = tmp_p0;
-            }
-
-            if(filter_flag_q != 0)
-            {
-                pu2_src[0] = tmp_q0;
-                pu2_src[1] = tmp_q1;
-                pu2_src[2] = tmp_q2;
-            }
-
-            pu2_src += src_strd;
-        }
-    }
-
-}
-
-
 /**
 *******************************************************************************
 *
@@ -786,266 +544,6 @@
 
 }
 
-
-/**
-*******************************************************************************
-*
-* @brief
-*
-*     Decision process and filtering for the luma block horizontal edge for high bit depth
-*
-* @par Description:
-*     The decision process for the luma block horizontal edge  is carried out
-*    and an appropriate filter is applied. The  boundary filter strength, bs
-*    should be greater than 0.  The pcm flags and the transquant bypass flags
-*    should be  taken care of by the calling function.
-*
-* @param[in] pu1_src
-*  Pointer to the src sample q(0,0)
-*
-* @param[in] src_strd
-*  Source stride
-*
-* @param[in] bs
-*  Boundary filter strength of q(0,0)
-*
-* @param[in] quant_param_p
-*  quantization parameter of p block
-*
-* @param[in] quant_param_q
-*  quantization parameter of p block
-*
-* @param[in] beta_offset_div2
-*
-*
-* @param[in] tc_offset_div2
-*
-*
-* @param[in] filter_flag_p
-*  flag whether to filter the p block
-*
-* @param[in] filter_flag_q
-*  flag whether to filter the q block
-*
-* @returns
-*
-* @remarks
-*  None
-*
-*******************************************************************************
-*/
-
-void ihevc_hbd_deblk_luma_horz(UWORD16 *pu2_src,
-                               WORD32 src_strd,
-                               WORD32 bs,
-                               WORD32 quant_param_p,
-                               WORD32 quant_param_q,
-                               WORD32 beta_offset_div2,
-                               WORD32 tc_offset_div2,
-                               WORD32 filter_flag_p,
-                               WORD32 filter_flag_q,
-                               UWORD8 bit_depth)
-{
-    WORD32 qp_luma, beta_indx, tc_indx;
-    WORD32 beta, tc;
-    WORD32 dp0, dp3, dq0, dq3, d0, d3, dp, dq, d;
-    WORD32 d_sam0, d_sam3;
-    WORD32 de, dep, deq;
-    WORD32 col;
-    WORD32 tmp_p0, tmp_p1, tmp_p2, tmp_q0, tmp_q1, tmp_q2;
-    WORD32 delta, delta_p, delta_q;
-
-    ASSERT((bs > 0));
-    ASSERT(filter_flag_p || filter_flag_q);
-
-    qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
-    beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
-
-    /* BS based on implementation can take value 3 if it is intra/inter egde          */
-    /* based on BS, tc index is calcuated by adding 2 * ( bs - 1) to QP and tc_offset */
-    /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2)          */
-    /* the above desired functionallity is achieved by doing (2*(bs>>1))              */
-
-    tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);
-
-    beta = gai4_ihevc_beta_table[beta_indx] * (1 << (bit_depth - 8));
-    tc = gai4_ihevc_tc_table[tc_indx] * (1 << (bit_depth - 8));
-    if(0 == tc)
-    {
-        return;
-    }
-
-    dq0 = ABS(pu2_src[2 * src_strd] - 2 * pu2_src[1 * src_strd] +
-                    pu2_src[0 * src_strd]);
-
-    dq3 = ABS(pu2_src[3 + 2 * src_strd] - 2 * pu2_src[3 + 1 * src_strd] +
-                    pu2_src[3 + 0 * src_strd]);
-
-    dp0 = ABS(pu2_src[-3 * src_strd] - 2 * pu2_src[-2 * src_strd] +
-                    pu2_src[-1 * src_strd]);
-
-    dp3 = ABS(pu2_src[3 - 3 * src_strd] - 2 * pu2_src[3 - 2 * src_strd] +
-                    pu2_src[3 - 1 * src_strd]);
-
-    d0 = dp0 + dq0;
-    d3 = dp3 + dq3;
-
-    dp = dp0 + dp3;
-    dq = dq0 + dq3;
-
-    d = d0 + d3;
-
-    de = 0;
-    dep = 0;
-    deq = 0;
-
-    if(d < beta)
-    {
-        d_sam0 = 0;
-        if((2 * d0 < (beta >> 2))
-                        && (ABS(pu2_src[3 * src_strd] - pu2_src[0 * src_strd]) +
-                                        ABS(pu2_src[-1 * src_strd] - pu2_src[-4 * src_strd])
-                                        < (beta >> 3))
-                        && ABS(pu2_src[0 * src_strd] - pu2_src[-1 * src_strd])
-                        < ((5 * tc + 1) >> 1))
-        {
-            d_sam0 = 1;
-        }
-
-        pu2_src += 3;
-        d_sam3 = 0;
-        if((2 * d3 < (beta >> 2))
-                        && (ABS(pu2_src[3 * src_strd] - pu2_src[0 * src_strd]) +
-                                        ABS(pu2_src[-1 * src_strd] - pu2_src[-4 * src_strd])
-                                        < (beta >> 3))
-                        && ABS(pu2_src[0 * src_strd] - pu2_src[-1 * src_strd])
-                        < ((5 * tc + 1) >> 1))
-        {
-            d_sam3 = 1;
-        }
-        pu2_src -= 3;
-
-        de = (d_sam0 == 1 && d_sam3 == 1) ? 2 : 1;
-        dep = (dp < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
-        deq = (dq < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
-        if(tc <= 1)
-        {
-            dep = 0;
-            deq = 0;
-        }
-    }
-
-    if(de != 0)
-    {
-        for(col = 0; col < 4; col++)
-        {
-            tmp_p0 = pu2_src[-1 * src_strd];
-            tmp_p1 = pu2_src[-2 * src_strd];
-            tmp_p2 = pu2_src[-3 * src_strd];
-
-            tmp_q0 = pu2_src[0 * src_strd];
-            tmp_q1 = pu2_src[1 * src_strd];
-            tmp_q2 = pu2_src[2 * src_strd];
-            if(de == 2)
-            {
-                tmp_q0 = CLIP3((pu2_src[2 * src_strd] +
-                                2 * pu2_src[1 * src_strd] +
-                                2 * pu2_src[0 * src_strd] +
-                                2 * pu2_src[-1 * src_strd] +
-                                pu2_src[-2 * src_strd] + 4) >> 3,
-                                pu2_src[0 * src_strd] - 2 * tc,
-                                pu2_src[0 * src_strd] + 2 * tc);
-
-                tmp_q1 = CLIP3((pu2_src[2 * src_strd] +
-                                pu2_src[1 * src_strd] +
-                                pu2_src[0 * src_strd] +
-                                pu2_src[-1 * src_strd] + 2) >> 2,
-                                pu2_src[1 * src_strd] - 2 * tc,
-                                pu2_src[1 * src_strd] + 2 * tc);
-
-                tmp_q2 = CLIP3((2 * pu2_src[3 * src_strd] +
-                                3 * pu2_src[2 * src_strd] +
-                                pu2_src[1 * src_strd] +
-                                pu2_src[0 * src_strd] +
-                                pu2_src[-1 * src_strd] + 4) >> 3,
-                                pu2_src[2 * src_strd] - 2 * tc,
-                                pu2_src[2 * src_strd] + 2 * tc);
-
-                tmp_p0 = CLIP3((pu2_src[1 * src_strd] +
-                                2 * pu2_src[0 * src_strd] +
-                                2 * pu2_src[-1 * src_strd] +
-                                2 * pu2_src[-2 * src_strd] +
-                                pu2_src[-3 * src_strd] + 4) >> 3,
-                                pu2_src[-1 * src_strd] - 2 * tc,
-                                pu2_src[-1 * src_strd] + 2 * tc);
-
-                tmp_p1 = CLIP3((pu2_src[0 * src_strd] +
-                                pu2_src[-1 * src_strd] +
-                                pu2_src[-2 * src_strd] +
-                                pu2_src[-3 * src_strd] + 2) >> 2,
-                                pu2_src[-2 * src_strd] - 2 * tc,
-                                pu2_src[-2 * src_strd] + 2 * tc);
-
-                tmp_p2 = CLIP3((pu2_src[0 * src_strd] +
-                                pu2_src[-1 * src_strd] +
-                                pu2_src[-2 * src_strd] +
-                                3 * pu2_src[-3 * src_strd] +
-                                2 * pu2_src[-4 * src_strd] + 4) >> 3,
-                                pu2_src[-3 * src_strd] - 2 * tc,
-                                pu2_src[-3 * src_strd] + 2 * tc);
-            }
-            else
-            {
-                delta = (9 * (pu2_src[0 * src_strd] - pu2_src[-1 * src_strd]) -
-                                3 * (pu2_src[1 * src_strd] - pu2_src[-2 * src_strd]) +
-                                8) >> 4;
-                if(ABS(delta) < 10 * tc)
-                {
-                    delta = CLIP3(delta, -tc, tc);
-                    tmp_p0 = CLIP3(pu2_src[-1 * src_strd] + delta, 0, ((1 << bit_depth) - 1));
-                    tmp_q0 = CLIP3(pu2_src[0 * src_strd] - delta, 0, ((1 << bit_depth) - 1));
-                    if(dep == 1)
-                    {
-                        delta_p = CLIP3((((pu2_src[-3 * src_strd] +
-                                        pu2_src[-1 * src_strd] + 1) >> 1) -
-                                        pu2_src[-2 * src_strd] + delta) >> 1,
-                                        -(tc >> 1),
-                                        (tc >> 1));
-                        tmp_p1 = CLIP3(pu2_src[-2 * src_strd] + delta_p, 0, ((1 << bit_depth) - 1));
-                    }
-
-                    if(deq == 1)
-                    {
-                        delta_q = CLIP3((((pu2_src[2 * src_strd] +
-                                        pu2_src[0 * src_strd] + 1) >> 1) -
-                                        pu2_src[1 * src_strd] - delta) >> 1,
-                                        -(tc >> 1),
-                                        (tc >> 1));
-                        tmp_q1 = CLIP3(pu2_src[1 * src_strd] + delta_q, 0, ((1 << bit_depth) - 1));
-                    }
-                }
-            }
-
-            if(filter_flag_p != 0)
-            {
-                pu2_src[-3 * src_strd] = tmp_p2;
-                pu2_src[-2 * src_strd] = tmp_p1;
-                pu2_src[-1 * src_strd] = tmp_p0;
-            }
-
-            if(filter_flag_q != 0)
-            {
-                pu2_src[0 * src_strd] = tmp_q0;
-                pu2_src[1 * src_strd] = tmp_q1;
-                pu2_src[2 * src_strd] = tmp_q2;
-            }
-
-            pu2_src += 1;
-        }
-    }
-}
-
-
 /**
 *******************************************************************************
 *
@@ -1165,122 +663,6 @@
 }
 
 
-/**
-*******************************************************************************
-*
-* @brief
-*     Filtering for the chroma block vertical edge.
-*
-* @par Description:
-*     Filter for chroma vertical edge. The  boundary filter strength, bs
-*    should be greater than 1.  The pcm flags and the transquant bypass flags
-*    should be  taken care of by the calling function.
-*
-* @param[in] pu2_src
-*  Pointer to the src sample q(0,0)
-*
-* @param[in] src_strd
-*  Source stride
-*
-* @param[in] bs
-*  Boundary filter strength of q(0,0)
-*
-* @param[in] quant_param_p
-*  quantization parameter of p block
-*
-* @param[in] quant_param_q
-*  quantization parameter of p block
-*
-* @param[in] beta_offset_div2
-*
-*
-* @param[in] tc_offset_div2
-*
-*
-* @param[in] filter_flag_p
-*  flag whether to filter the p block
-*
-* @param[in] filter_flag_q
-*  flag whether to filter the q block
-*
-* @returns
-*
-* @remarks
-*  None
-*
-*******************************************************************************
-*/
-
-void ihevc_hbd_deblk_chroma_vert(UWORD16 *pu2_src,
-                                 WORD32 src_strd,
-                                 WORD32 quant_param_p,
-                                 WORD32 quant_param_q,
-                                 WORD32 qp_offset_u,
-                                 WORD32 qp_offset_v,
-                                 WORD32 tc_offset_div2,
-                                 WORD32 filter_flag_p,
-                                 WORD32 filter_flag_q,
-                                 UWORD8 bit_depth)
-{
-    WORD32 qp_indx_u, qp_chroma_u;
-    WORD32 qp_indx_v, qp_chroma_v;
-    WORD32 tc_indx_u, tc_u;
-    WORD32 tc_indx_v, tc_v;
-    WORD32 delta_u, tmp_p0_u, tmp_q0_u;
-    WORD32 delta_v, tmp_p0_v, tmp_q0_v;
-    WORD32 row;
-
-    ASSERT(filter_flag_p || filter_flag_q);
-
-    /* chroma processing is done only if BS is 2             */
-    /* this function is assumed to be called only if BS is 2 */
-    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
-    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
-
-    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
-    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
-
-    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
-    tc_u = gai4_ihevc_tc_table[tc_indx_u] * (1 << (bit_depth - 8));
-
-    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
-    tc_v = gai4_ihevc_tc_table[tc_indx_v] * (1 << (bit_depth - 8));
-
-    if(0 == tc_u && 0 == tc_v)
-    {
-        return;
-    }
-
-    for(row = 0; row < 4; row++)
-    {
-        delta_u = CLIP3((((pu2_src[0] - pu2_src[-2]) << 2) +
-                        pu2_src[-4] - pu2_src[2] + 4) >> 3,
-                        -tc_u, tc_u);
-        tmp_p0_u = CLIP3(pu2_src[-2] + delta_u, 0, ((1 << bit_depth) - 1));
-        tmp_q0_u = CLIP3(pu2_src[0] - delta_u, 0, ((1 << bit_depth) - 1));
-
-        delta_v = CLIP3((((pu2_src[1] - pu2_src[-1]) << 2) +
-                        pu2_src[-3] - pu2_src[3] + 4) >> 3,
-                        -tc_v, tc_v);
-        tmp_p0_v = CLIP3(pu2_src[-1] + delta_v, 0, ((1 << bit_depth) - 1));
-        tmp_q0_v = CLIP3(pu2_src[1] - delta_v, 0, ((1 << bit_depth) - 1));
-        if(filter_flag_p != 0)
-        {
-            pu2_src[-2] = tmp_p0_u;
-            pu2_src[-1] = tmp_p0_v;
-        }
-
-        if(filter_flag_q != 0)
-        {
-            pu2_src[0] = tmp_q0_u;
-            pu2_src[1] = tmp_q0_v;
-        }
-
-        pu2_src += src_strd;
-    }
-
-}
-
 
 /**
 *******************************************************************************
@@ -1395,116 +777,3 @@
 
 }
 
-
-/**
-*******************************************************************************
-*
-* @brief
-*   Filtering for the chroma block horizontal edge.
-*
-* @par Description:
-*     Filter for chroma horizontal edge. The  boundary filter strength, bs
-*    should be greater than 1.  The pcm flags and the transquant bypass flags
-*    should be  taken care of by the calling function.
-*
-* @param[in] pu2_src
-*  Pointer to the src sample q(0,0)
-*
-* @param[in] src_strd
-*  Source stride
-*
-* @param[in] bs
-*  Boundary filter strength of q(0,0)
-*
-* @param[in] quant_param_p
-*  quantization parameter of p block
-*
-* @param[in] quant_param_q
-*  quantization parameter of p block
-*
-* @param[in] beta_offset_div2
-*
-*
-* @param[in] tc_offset_div2
-*
-*
-* @param[in] filter_flag_p
-*  flag whether to filter the p block
-*
-* @param[in] filter_flag_q
-*  flag whether to filter the q block
-*
-* @returns
-*
-* @remarks
-*  None
-*
-*******************************************************************************
-*/
-
-void ihevc_hbd_deblk_chroma_horz(UWORD16 *pu2_src,
-                                 WORD32 src_strd,
-                                 WORD32 quant_param_p,
-                                 WORD32 quant_param_q,
-                                 WORD32 qp_offset_u,
-                                 WORD32 qp_offset_v,
-                                 WORD32 tc_offset_div2,
-                                 WORD32 filter_flag_p,
-                                 WORD32 filter_flag_q,
-                                 UWORD8 bit_depth)
-{
-    WORD32 qp_indx_u, qp_chroma_u;
-    WORD32 qp_indx_v, qp_chroma_v;
-    WORD32 tc_indx_u, tc_u;
-    WORD32 tc_indx_v, tc_v;
-    WORD32 tc;
-
-    WORD32 delta, tmp_p0, tmp_q0;
-    WORD32 col;
-
-    ASSERT(filter_flag_p || filter_flag_q);
-
-    /* chroma processing is done only if BS is 2             */
-    /* this function is assumed to be called only if BS is 2 */
-    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
-    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
-
-    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
-    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
-
-    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
-    tc_u = gai4_ihevc_tc_table[tc_indx_u] * (1 << (bit_depth - 8));
-
-    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
-    tc_v = gai4_ihevc_tc_table[tc_indx_v] * (1 << (bit_depth - 8));
-
-    if(0 == tc_u && 0 == tc_v)
-    {
-        return;
-    }
-
-    for(col = 0; col < 8; col++)
-    {
-        tc = (col & 1) ? tc_v : tc_u;
-        delta = CLIP3((((pu2_src[0 * src_strd] -
-                      pu2_src[-1 * src_strd]) << 2) +
-                      pu2_src[-2 * src_strd] -
-                      pu2_src[1 * src_strd] + 4) >> 3,
-                      -tc, tc);
-        tmp_p0 = CLIP3(pu2_src[-1 * src_strd] + delta, 0, ((1 << bit_depth) - 1));
-        tmp_q0 = CLIP3(pu2_src[0 * src_strd] - delta, 0, ((1 << bit_depth) - 1));
-
-        if(filter_flag_p != 0)
-        {
-            pu2_src[-1 * src_strd] = tmp_p0;
-        }
-
-        if(filter_flag_q != 0)
-        {
-            pu2_src[0 * src_strd] = tmp_q0;
-        }
-
-        pu2_src += 1;
-    }
-
-}
diff --git a/common/ihevc_debug.h b/common/ihevc_debug.h
index fa620fc..e50e733 100644
--- a/common/ihevc_debug.h
+++ b/common/ihevc_debug.h
@@ -53,7 +53,7 @@
 #ifndef ASSERT_EXIT
 
 #define ASSERT(x) assert((x))
-//#define ASSERT(x) ihevcd_debug_assert((x))
+//#define ASSERT(x) ihevcd_debug_ASSERT((x))
 
 #else
 #define ASSERT(x)                        \
diff --git a/common/ihevc_defs.h b/common/ihevc_defs.h
index bd92d7d..7f58121 100644
--- a/common/ihevc_defs.h
+++ b/common/ihevc_defs.h
@@ -404,6 +404,8 @@
 
 #define MAX_HEVC_QP_10bit 63  //FOR HBD Branch Encoder
 
+#define MAX_HEVC_QP_12bit 75  //FOR HBD Branch Encoder
+
 
 /**
  * @brief  Total number of transform sizes
diff --git a/common/ihevc_inter_pred.h b/common/ihevc_inter_pred.h
index b6cca71..e84e912 100644
--- a/common/ihevc_inter_pred.h
+++ b/common/ihevc_inter_pred.h
@@ -381,6 +381,23 @@
 ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out_avx2;
 ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp_avx2;
 ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out_avx2;
+
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_copy_avx2;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_horz_avx2;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_vert_avx2;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_copy_w16out_avx2;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_horz_w16out_avx2;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_vert_w16out_avx2;
+ihevc_hbd_inter_pred_w16inp_ft ihevc_hbd_inter_pred_luma_vert_w16inp_avx2;
+ihevc_hbd_inter_pred_w16inp_w16out_ft ihevc_hbd_inter_pred_luma_vert_w16inp_w16out_avx2;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_copy_avx2;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_horz_avx2;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_vert_avx2;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_copy_w16out_avx2;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_horz_w16out_avx2;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_vert_w16out_avx2;
+ihevc_hbd_inter_pred_w16inp_ft ihevc_hbd_inter_pred_chroma_vert_w16inp_avx2;
+ihevc_hbd_inter_pred_w16inp_w16out_ft ihevc_hbd_inter_pred_chroma_vert_w16inp_w16out_avx2;
 #endif
 
 /* armv8 function declarations */
diff --git a/common/ihevc_macros.h b/common/ihevc_macros.h
index b3fb743..8ec237b 100644
--- a/common/ihevc_macros.h
+++ b/common/ihevc_macros.h
@@ -37,7 +37,7 @@
 #define RETURN_IF(cond, retval) if(cond) {return (retval);}
 #define UNUSED(x) ((void)(x))
 
-#define CLIP3(x, min, max) (((x) > max) ? max :(((x) < min)? min:(x)))
+#define CLIP3(x, min, max) (((x) > (max)) ? (max) :(((x) < (min))? (min):(x)))
 
 #define MAX(x,y)    (((x) > (y)) ? (x) :(y))
 #define MIN(x,y)    (((x) < (y)) ? (x) :(y))
diff --git a/common/ihevc_structs.h b/common/ihevc_structs.h
index 015bdc7..26c1a39 100644
--- a/common/ihevc_structs.h
+++ b/common/ihevc_structs.h
@@ -465,7 +465,6 @@
      * 4x4 Luma TUs only the fourth one contains cb,cr
      * TODO: Check if this is really needed, cb_cbf and cr_cbf should be enough
      */
-    //UWORD32      b1_chroma_present   : 1;
 
     /**
      *  Y CBF
@@ -477,6 +476,7 @@
      */
     UWORD32      b1_cb_cbf           : 1;
 
+
     /**
      *  Cr CBF
      */
@@ -514,7 +514,6 @@
      */
     UWORD32    b3_chroma_intra_mode_idx    : 3;
 
-
 }tu_t;
 
 /**
@@ -1035,6 +1034,51 @@
     WORD8 i1_frame_only_constraint_flag;
 
     /**
+     * general_max_12bit_constraint_flag
+     */
+    WORD8 i1_general_max_12bit_constraint_flag;
+
+    /**
+     * general_max_10bit_constraint_flag
+     */
+    WORD8 i1_general_max_10bit_constraint_flag;
+
+    /**
+     * general_max_8bit_constraint_flag
+     */
+    WORD8 i1_general_max_8bit_constraint_flag;
+
+    /**
+     * general_max_422chroma_constraint_flag
+     */
+    WORD8 i1_general_max_422chroma_constraint_flag;
+
+    /**
+     * general_max_420chroma_constraint_flag
+     */
+    WORD8 i1_general_max_420chroma_constraint_flag;
+
+    /**
+     * general_max_monochrome_constraint_flag
+     */
+    WORD8 i1_general_max_monochrome_constraint_flag;
+
+    /**
+     * general_intra_constraint_flag
+     */
+    WORD8 i1_general_intra_constraint_flag;
+
+    /**
+     * general_one_picture_only_constraint_flag
+     */
+    WORD8 i1_general_one_picture_only_constraint_flag;
+
+    /**
+     * general_lower_bit_rate_constraint_flag
+     */
+    WORD8 i1_general_lower_bit_rate_constraint_flag;
+
+    /**
      *  level_idc
      */
     UWORD8 u1_level_idc;
@@ -1124,6 +1168,7 @@
     /** delta_chroma_log2_weight_denom */
     WORD8 i1_chroma_log2_weight_denom;
 
+
     /** luma_weight_l0_flag[ i ] */
     WORD8 i1_luma_weight_l0_flag[MAX_DPB_SIZE];
 
@@ -1191,6 +1236,10 @@
 
     /* list_entry_l1[ i ] */
     WORD8 i1_list_entry_l1[16];
+
+    /* Reference POC values for L0,L1 */
+    WORD32 i4_ref_poc_l0[16];
+    WORD32 i4_ref_poc_l1[16];
 }rplm_t;
 
 
@@ -1987,6 +2036,7 @@
     /*************************************************************************/
     WORD16 *pi2_scaling_mat;
 
+
     /*
      * Flag indicating if the SPS is parsed
      */
@@ -2229,6 +2279,7 @@
      */
     WORD8 i1_log2_min_cu_qp_delta_size;
 
+
     /*
      * Flag indicating if the PPS is parsed
      */
@@ -2347,7 +2398,7 @@
     /**
     * Encoder buffer fullness  used in buffering period SEI
     */
-    UWORD32 u4_ebf_sei;
+    UWORD32 u4_dbf_sei;
 
     /**
     * target bitrate used in buffering period SEI
@@ -2510,7 +2561,7 @@
     active_parameter_set_sei_param_t s_active_parameter_set_sei_params;
 
 
-}sei_params_t;
+} sei_params_t;
 
 
 
@@ -2788,6 +2839,7 @@
      */
     WORD16 i2_independent_ctb_y;
 
+
     UWORD8 u1_parse_data_init_done;
 
     /**
diff --git a/common/x86/ihevc_platform_macros.h b/common/x86/ihevc_platform_macros.h
index ae688e6..22c58d2 100644
--- a/common/x86/ihevc_platform_macros.h
+++ b/common/x86/ihevc_platform_macros.h
@@ -91,6 +91,57 @@
     }
 }
 
+/**
+******************************************************************************
+ *  @brief  returns position of msb bit for 32bit input
+******************************************************************************
+ */
+#define GET_POS_MSB_32(r,word)                         \
+{                                                       \
+    if(word)                                           \
+    {                                                   \
+        r = 31 - __builtin_clz(word);                  \
+    }                                                   \
+    else                                                \
+    {                                                   \
+        r = -1;                                         \
+    }                                                   \
+}
+
+/**
+******************************************************************************
+ *  @brief  returns position of msb bit for 64bit input
+******************************************************************************
+ */
+#define GET_POS_MSB_64(r,word)                          \
+{                                                       \
+    if(word)                                            \
+    {                                                   \
+        r = 63 - __builtin_clzll(word);                 \
+    }                                                   \
+    else                                                \
+    {                                                   \
+        r = -1;                                         \
+    }                                                   \
+}
+
+
+/**
+******************************************************************************
+ *  @brief  returns max number of bits required to represent input word (max 32bits)
+******************************************************************************
+ */
+#define GETRANGE(r,word)                                \
+{                                                       \
+    if(word)                                            \
+    {                                                   \
+        r = 32 - __builtin_clz(word);                   \
+    }                                                   \
+    else                                                \
+    {                                                   \
+        r = 1;                                          \
+    }                                                   \
+}
 #define GCC_ENABLE 1
 
 #if GCC_ENABLE
diff --git a/decoder.arm.mk b/decoder.arm.mk
index 903822d..91a52e6 100644
--- a/decoder.arm.mk
+++ b/decoder.arm.mk
@@ -3,7 +3,7 @@
 
 libhevcd_srcs_c_arm    +=  decoder/arm/ihevcd_function_selector.c
 libhevcd_srcs_c_arm    +=  decoder/arm/ihevcd_function_selector_noneon.c
-libhevcd_cflags_arm    += -DDISABLE_NEONINTR  -DARM -DARMGCC
+libhevcd_cflags_arm    += -DDISABLE_NEONINTR  -DARM -DARMGCC -fno-tree-vectorize
 
 LOCAL_ARM_MODE         := arm
 
diff --git a/decoder/ihevcd_api.c b/decoder/ihevcd_api.c
index 39c1fb6..ea0b8b5 100644
--- a/decoder/ihevcd_api.c
+++ b/decoder/ihevcd_api.c
@@ -1523,6 +1523,8 @@
     ps_codec->s_parse.i4_first_pic_init = 0;
     ps_codec->i4_error_code = 0;
     ps_codec->i4_reset_flag = 0;
+    ps_codec->i4_cra_as_first_pic = 1;
+    ps_codec->i4_rasl_output_flag = 0;
 
     ps_codec->i4_prev_poc_msb = 0;
     ps_codec->i4_prev_poc_lsb = -1;
@@ -1550,6 +1552,7 @@
     ps_codec->i4_disable_sao_pic    = 0;
     ps_codec->i4_fullpel_inter_pred = 0;
     ps_codec->u4_enable_fmt_conv_ahead = 0;
+    ps_codec->i4_share_disp_buf_cnt = 0;
 
     {
         sps_t *ps_sps = ps_codec->ps_sps_base;
@@ -1925,9 +1928,9 @@
      *
      * One extra MV Bank is needed to hold current pics MV bank.
      * Since this is only a structure allocation and not actual buffer allocation,
-     * it is allocated for BUF_MGR_MAX_CNT entries
+     * it is allocated for (MAX_DPB_SIZE + 1) entries
      */
-    ps_mem_rec->u4_mem_size += BUF_MGR_MAX_CNT * sizeof(mv_buf_t);
+    ps_mem_rec->u4_mem_size += (MAX_DPB_SIZE + 1) * sizeof(mv_buf_t);
 
     {
         /* Allocate for pu_map, pu_t and pic_pu_idx for each MV bank */
@@ -2333,9 +2336,27 @@
     ps_mem_rec->u4_mem_size += BUF_MGR_MAX_CNT * sizeof(pic_buf_t);
 
     /* In case of non-shared mode allocate for reference picture buffers */
-    if(0 == share_disp_buf)
+    /* In case of shared and 420p output, allocate for chroma samples */
+    if((0 == share_disp_buf) || (chroma_format == IV_YUV_420P))
     {
-        UWORD32 num_reorder_frames_local = num_reorder_frames;
+        UWORD32 init_num_bufs;
+        UWORD32 init_extra_bufs;
+        WORD32 chroma_only;
+
+        chroma_only = 0;
+        init_extra_bufs = 0;
+        init_num_bufs = num_reorder_frames + num_ref_frames + 1;
+
+        /* In case of shared display buffers and chroma format 420P
+         * Allocate for chroma in reference buffers, luma buffer will be display buffer
+         */
+
+        if((1 == share_disp_buf) && (chroma_format == IV_YUV_420P))
+        {
+            chroma_only = 1;
+            init_extra_bufs = num_extra_disp_bufs;
+        }
+
         /* Note: Number of luma samples is not max_wd * max_ht here, instead it is
          * set to maximum number of luma samples allowed at the given level.
          * This is done to ensure that any stream with width and height lesser
@@ -2350,7 +2371,7 @@
          */
         ps_mem_rec->u4_mem_size +=
                         ihevcd_get_total_pic_buf_size(max_wd_luma * max_ht_luma, level,  PAD_WD,  PAD_HT,
-                                                      num_ref_frames, num_reorder_frames_local);
+                                                      init_num_bufs, init_extra_bufs, chroma_only);
     }
     DEBUG("\nMemory record Id %d = %d \n", MEM_REC_REF_PIC,
                     ps_mem_rec->u4_mem_size);
@@ -2960,7 +2981,8 @@
     ps_codec->pv_pic_buf_mgr = ps_mem_rec->pv_base;
     ps_codec->pv_pic_buf_base = (UWORD8 *)ps_codec->pv_pic_buf_mgr + sizeof(buf_mgr_t);
     ps_codec->i4_total_pic_buf_size = ps_mem_rec->u4_mem_size - sizeof(buf_mgr_t);
-
+    ps_codec->pu1_cur_chroma_ref_buf = (UWORD8 *)ps_codec->pv_pic_buf_base + BUF_MGR_MAX_CNT * sizeof(pic_buf_t);
+    ps_codec->i4_remaining_pic_buf_size = ps_codec->i4_total_pic_buf_size - BUF_MGR_MAX_CNT * sizeof(pic_buf_t);
 
 
 
@@ -3146,7 +3168,25 @@
             pu1_buf =  ps_dec_disp_ip->s_disp_buffer[i].pu1_bufs[0];
             ps_pic_buf->pu1_luma = pu1_buf + strd * PAD_TOP + PAD_LEFT;
 
-            pu1_buf =  ps_dec_disp_ip->s_disp_buffer[i].pu1_bufs[1];
+            if(ps_codec->e_chroma_fmt == IV_YUV_420P)
+            {
+                pu1_buf =  ps_codec->pu1_cur_chroma_ref_buf;
+                ps_codec->pu1_cur_chroma_ref_buf += strd * (ps_codec->i4_ht / 2 + PAD_HT / 2);
+                ps_codec->i4_remaining_pic_buf_size -= strd * (ps_codec->i4_ht / 2 + PAD_HT / 2);
+
+                if(0 > ps_codec->i4_remaining_pic_buf_size)
+                {
+                    ps_codec->i4_error_code = IHEVCD_BUF_MGR_ERROR;
+                    return IHEVCD_BUF_MGR_ERROR;
+                }
+
+            }
+            else
+            {
+                /* For YUV 420SP case use display buffer itself as chroma ref buffer */
+                pu1_buf =  ps_dec_disp_ip->s_disp_buffer[i].pu1_bufs[1];
+            }
+
             ps_pic_buf->pu1_chroma = pu1_buf + strd * (PAD_TOP / 2) + PAD_LEFT;
 
             buf_ret = ihevc_buf_mgr_add((buf_mgr_t *)ps_codec->pv_pic_buf_mgr, ps_pic_buf, i);
@@ -3166,6 +3206,13 @@
 
             ps_pic_buf++;
 
+            /* Store display buffers in codec context. Needed for 420p output */
+            memcpy(&ps_codec->s_disp_buffer[ps_codec->i4_share_disp_buf_cnt],
+                   &ps_dec_disp_ip->s_disp_buffer[i],
+                   sizeof(ps_dec_disp_ip->s_disp_buffer[i]));
+
+            ps_codec->i4_share_disp_buf_cnt++;
+
         }
     }
 
@@ -3658,8 +3705,7 @@
         {
             strd = s_ctl_dynparams_ip->u4_disp_wd;
         }
-        else if(0 == ps_codec->i4_sps_done ||
-                        0 == ps_codec->i4_pps_done)
+        else if(0 == ps_codec->i4_sps_done)
         {
             strd = s_ctl_dynparams_ip->u4_disp_wd;
         }
diff --git a/decoder/ihevcd_boundary_strength.c b/decoder/ihevcd_boundary_strength.c
index 391ea49..aedac22 100644
--- a/decoder/ihevcd_boundary_strength.c
+++ b/decoder/ihevcd_boundary_strength.c
@@ -386,8 +386,9 @@
                 /*If the 1st slice in a new tile is a dependent slice*/
                 if(!((ps_bs_ctxt->ps_slice_hdr->i1_dependent_slice_flag == 1) && (slice_idx == left_slice_idx)))
                 {
+                    /* Removed redundant checks */
                     if((0 == i1_loop_filter_across_slices_enabled_flag && (
-                                    (0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_slice_y) || (0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_tile_x) ||
+                                    ((slice_idx != left_slice_idx) && 0 == ps_bs_ctxt->i4_ctb_slice_y) ||
                                     ((0 == ps_bs_ctxt->i4_ctb_tile_x) && (slice_idx != left_slice_idx)))) ||
                                     (0 == ps_bs_ctxt->i4_ctb_x))
                     {
@@ -406,8 +407,9 @@
         }
 
         /* If top neighbor is not available, then set BS for entire first row to zero */
+        /* Removed redundant checks */
         if((0 == i1_loop_filter_across_tiles_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_tile_y)
-                        || (0 == i1_loop_filter_across_slices_enabled_flag && ((0 == ps_bs_ctxt->i4_ctb_slice_y) || (slice_idx != top_slice_idx)))
+                        || (0 == i1_loop_filter_across_slices_enabled_flag && ((slice_idx != top_slice_idx)))
                         || (0 == ps_bs_ctxt->i4_ctb_y))
         {
             pu4_horz_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
@@ -941,9 +943,10 @@
 
                 if(!((ps_bs_ctxt->ps_slice_hdr->i1_dependent_slice_flag == 1) && (slice_idx == left_slice_idx)))
                 {
+                    /* Removed redundant checks */
                     if((0 == i1_loop_filter_across_slices_enabled_flag && (
-                                    (0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_slice_y) || (0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_tile_x)
-                                    || ((0 == ps_bs_ctxt->i4_ctb_tile_x) && (slice_idx != left_slice_idx)))) || (0 == ps_bs_ctxt->i4_ctb_x))
+                                    (0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_slice_y) ||
+                                    ((0 == ps_bs_ctxt->i4_ctb_tile_x) && (slice_idx != left_slice_idx)))) || (0 == ps_bs_ctxt->i4_ctb_x))
                     {
                         pu4_vert_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
                     }
@@ -959,8 +962,9 @@
             top_slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
         }
         /* If top neighbor is not available, then set BS for entire first row to zero */
+        /* Removed redundant checks */
         if((0 == i1_loop_filter_across_tiles_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_tile_y)
-                        || (0 == i1_loop_filter_across_slices_enabled_flag && ((0 == ps_bs_ctxt->i4_ctb_slice_y) || (slice_idx != top_slice_idx)))
+                        || (0 == i1_loop_filter_across_slices_enabled_flag && ((slice_idx != top_slice_idx)))
                         || (0 == ps_bs_ctxt->i4_ctb_y))
         {
             pu4_horz_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
diff --git a/decoder/ihevcd_common_tables.c b/decoder/ihevcd_common_tables.c
index 1f6065b..d94a33b 100644
--- a/decoder/ihevcd_common_tables.c
+++ b/decoder/ihevcd_common_tables.c
@@ -38,7 +38,7 @@
 #include "ihevcd_common_tables.h"
 #include "ihevc_defs.h"
 
-WORD16 gai2_ihevcd_chroma_qp[] =
+const WORD16 gai2_ihevcd_chroma_qp[] =
   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
     17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29,
     30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38,
diff --git a/decoder/ihevcd_common_tables.h b/decoder/ihevcd_common_tables.h
index 61bc93f..217fb1f 100644
--- a/decoder/ihevcd_common_tables.h
+++ b/decoder/ihevcd_common_tables.h
@@ -34,7 +34,7 @@
 #ifndef _IHEVCD_COMMON_TABLES_H_
 #define _IHEVCD_COMMON_TABLES_H_
 
-extern WORD16 gai2_ihevcd_chroma_qp[];
+extern const WORD16 gai2_ihevcd_chroma_qp[];
 
 extern const UWORD8 gau1_intra_pred_chroma_modes[];
 
diff --git a/decoder/ihevcd_decode.c b/decoder/ihevcd_decode.c
index 0afdd31..94de640 100644
--- a/decoder/ihevcd_decode.c
+++ b/decoder/ihevcd_decode.c
@@ -76,7 +76,7 @@
 #define NUM_FRAMES_LIMIT_ENABLED 0
 
 #if NUM_FRAMES_LIMIT_ENABLED
-#define NUM_FRAMES_LIMIT 3600
+#define NUM_FRAMES_LIMIT 10000
 #else
 #define NUM_FRAMES_LIMIT 0x7FFFFFFF
 #endif
@@ -183,16 +183,16 @@
     ps_dec_op->u4_error_code = ihevcd_map_error((IHEVCD_ERROR_T)ps_codec->i4_error_code);
     ps_dec_op->u4_num_bytes_consumed = ps_dec_ip->u4_num_Bytes
                     - ps_codec->i4_bytes_remaining;
-    if(ps_codec->i4_sps_done)
-    {
-        ps_dec_op->u4_pic_wd = ps_codec->i4_disp_wd;
-        ps_dec_op->u4_pic_ht = ps_codec->i4_disp_ht;
-    }
-    else if(ps_codec->i4_error_code == IHEVCD_UNSUPPORTED_DIMENSIONS)
+    if(ps_codec->i4_error_code == IHEVCD_UNSUPPORTED_DIMENSIONS)
     {
         ps_dec_op->u4_pic_wd = ps_codec->i4_new_max_wd;
         ps_dec_op->u4_pic_ht = ps_codec->i4_new_max_ht;
     }
+    else if(ps_codec->i4_sps_done)
+    {
+        ps_dec_op->u4_pic_wd = ps_codec->i4_disp_wd;
+        ps_dec_op->u4_pic_ht = ps_codec->i4_disp_ht;
+    }
     else
     {
         ps_dec_op->u4_pic_wd = 0;
@@ -347,7 +347,11 @@
     ps_dec_ip = (ivd_video_decode_ip_t *)pv_api_ip;
     ps_dec_op = (ivd_video_decode_op_t *)pv_api_op;
 
-    memset(ps_dec_op, 0, sizeof(ivd_video_decode_op_t));
+    {
+        UWORD32 u4_size = ps_dec_op->u4_size;
+        memset(ps_dec_op, 0, sizeof(ivd_video_decode_op_t));
+        ps_dec_op->u4_size = u4_size; //Restore size field
+    }
     if(ps_codec->i4_init_done != 1)
     {
         ps_dec_op->u4_error_code |= 1 << IVD_FATALERROR;
@@ -603,7 +607,7 @@
             continue;
         }
 
-        if(((IHEVCD_FAIL == ret) && (ps_codec->i4_error_code == IVD_RES_CHANGED)) ||
+        if((IVD_RES_CHANGED == ret) ||
            (IHEVCD_UNSUPPORTED_DIMENSIONS == ret))
         {
             break;
diff --git a/decoder/ihevcd_fmt_conv.c b/decoder/ihevcd_fmt_conv.c
index 94f5e21..4e0e4f7 100644
--- a/decoder/ihevcd_fmt_conv.c
+++ b/decoder/ihevcd_fmt_conv.c
@@ -752,6 +752,24 @@
         pu1_y_src   = pu1_luma + cur_row * ps_codec->i4_strd;
         pu1_uv_src  = pu1_chroma + (cur_row / 2) * ps_codec->i4_strd;
 
+        /* In case of shared mode, with 420P output, get chroma destination */
+        if((1 == ps_codec->i4_share_disp_buf) && (IV_YUV_420P == ps_codec->e_chroma_fmt))
+        {
+            WORD32 i;
+            for(i = 0; i < ps_codec->i4_share_disp_buf_cnt; i++)
+            {
+                WORD32 diff = ps_disp_pic->pu1_luma - ps_codec->s_disp_buffer[i].pu1_bufs[0];
+                if(diff == (ps_codec->i4_strd * PAD_TOP + PAD_LEFT))
+                {
+                    pu1_u_dst = ps_codec->s_disp_buffer[i].pu1_bufs[1];
+                    pu1_u_dst += (ps_codec->i4_strd * PAD_TOP) / 4 + (PAD_LEFT / 2);
+
+                    pu1_v_dst = ps_codec->s_disp_buffer[i].pu1_bufs[2];
+                    pu1_v_dst += (ps_codec->i4_strd * PAD_TOP) / 4 + (PAD_LEFT / 2);
+                    break;
+                }
+            }
+        }
         pu2_rgb_dst_tmp  = (UWORD16 *)pu1_y_dst;
         pu2_rgb_dst_tmp  += cur_row * ps_codec->i4_disp_strd;
         pu4_rgb_dst_tmp  = (UWORD32 *)pu1_y_dst;
diff --git a/decoder/ihevcd_inter_pred.c b/decoder/ihevcd_inter_pred.c
index 889e60b..8e3fe77 100644
--- a/decoder/ihevcd_inter_pred.c
+++ b/decoder/ihevcd_inter_pred.c
@@ -70,7 +70,7 @@
 #include "ihevc_inter_pred.h"
 #include "ihevcd_profile.h"
 
-WORD8 luma_filter[4][NTAPS_LUMA] =
+static WORD8 gai1_luma_filter[4][NTAPS_LUMA] =
 {
     { 0, 0, 0, 64, 0, 0, 0, 0 },
     { -1, 4, -10, 58, 17, -5, 1, 0 },
@@ -78,7 +78,7 @@
     { 0, 1, -5, 17, 58, -10, 4, -1 } };
 
 /* The filter uses only the first four elements in each array */
-WORD8 chroma_filter[8][NTAPS_LUMA] =
+static WORD8 gai1_chroma_filter[8][NTAPS_LUMA] =
 {
     { 0, 64, 0, 0, 0, 0, 0, 0 },
     { -2, 58, 10, -2, 0, 0, 0, 0 },
@@ -308,7 +308,7 @@
                 pu1_dst = pu1_dst_luma + pu_y * ref_strd + pu_x;
 
                 ntaps = NTAPS_LUMA;
-                coeff = luma_filter;
+                coeff = gai1_luma_filter;
             }
 
             else
@@ -354,7 +354,7 @@
                 pu1_dst = pu1_dst_chroma + pu_y * ref_strd / 2 + pu_x;
 
                 ntaps = NTAPS_CHROMA;
-                coeff = chroma_filter;
+                coeff = gai1_chroma_filter;
             }
 
             if(ps_pu->b2_pred_mode != PRED_L1)
diff --git a/decoder/ihevcd_iquant_itrans_recon_ctb.c b/decoder/ihevcd_iquant_itrans_recon_ctb.c
index 4ad2d66..a558644 100644
--- a/decoder/ihevcd_iquant_itrans_recon_ctb.c
+++ b/decoder/ihevcd_iquant_itrans_recon_ctb.c
@@ -80,11 +80,11 @@
 #include "ihevcd_statistics.h"
 #include "ihevcd_itrans_recon_dc.h"
 
-const UWORD32 gau4_ihevcd_4_bit_reverse[] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
+static const UWORD32 gau4_ihevcd_4_bit_reverse[] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
 
 
 /* Globals */
-WORD32 g_i4_ip_funcs[MAX_NUM_IP_MODES] =
+static const WORD32 g_i4_ip_funcs[MAX_NUM_IP_MODES] =
   { IP_FUNC_MODE_0, /* Mode 0 */
     IP_FUNC_MODE_1, /* Mode 1 */
     IP_FUNC_MODE_2, /* Mode 2 */
diff --git a/decoder/ihevcd_mv_merge.c b/decoder/ihevcd_mv_merge.c
index 52bab9f..9ec3029 100644
--- a/decoder/ihevcd_mv_merge.c
+++ b/decoder/ihevcd_mv_merge.c
@@ -313,16 +313,16 @@
             if(au4_list_col[0] == 0)
             {
                 col_ref_poc_l0 =
-                                ps_mv_buf_col->l0_collocated_poc[slice_idx][au4_ref_idx_col[0]];
+                                ps_mv_buf_col->ai4_l0_collocated_poc[slice_idx][au4_ref_idx_col[0]];
                 col_ref_poc_l0_lt =
-                                (ps_mv_buf_col->u1_l0_collocated_poc_lt[slice_idx][au4_ref_idx_col[0]] == LONG_TERM_REF);
+                                (ps_mv_buf_col->ai1_l0_collocated_poc_lt[slice_idx][au4_ref_idx_col[0]] == LONG_TERM_REF);
             }
             else
             {
                 col_ref_poc_l0 =
-                                ps_mv_buf_col->l1_collocated_poc[slice_idx][au4_ref_idx_col[0]];
+                                ps_mv_buf_col->ai4_l1_collocated_poc[slice_idx][au4_ref_idx_col[0]];
                 col_ref_poc_l0_lt =
-                                (ps_mv_buf_col->u1_l1_collocated_poc_lt[slice_idx][au4_ref_idx_col[0]] == LONG_TERM_REF);
+                                (ps_mv_buf_col->ai1_l1_collocated_poc_lt[slice_idx][au4_ref_idx_col[0]] == LONG_TERM_REF);
             }
             /* L0 collocated mv */
             ps_pic_buf = (pic_buf_t *)((ps_ref_list[0][ref_idx_l0].pv_pic_buf));
@@ -358,16 +358,16 @@
                 if(au4_list_col[1] == 0)
                 {
                     col_ref_poc_l1 =
-                                    ps_mv_buf_col->l0_collocated_poc[slice_idx][au4_ref_idx_col[1]];
+                                    ps_mv_buf_col->ai4_l0_collocated_poc[slice_idx][au4_ref_idx_col[1]];
                     col_ref_poc_l1_lt =
-                                    (ps_mv_buf_col->u1_l0_collocated_poc_lt[slice_idx][au4_ref_idx_col[1]] == LONG_TERM_REF);
+                                    (ps_mv_buf_col->ai1_l0_collocated_poc_lt[slice_idx][au4_ref_idx_col[1]] == LONG_TERM_REF);
                 }
                 else
                 {
                     col_ref_poc_l1 =
-                                    ps_mv_buf_col->l1_collocated_poc[slice_idx][au4_ref_idx_col[1]];
+                                    ps_mv_buf_col->ai4_l1_collocated_poc[slice_idx][au4_ref_idx_col[1]];
                     col_ref_poc_l1_lt =
-                                    (ps_mv_buf_col->u1_l1_collocated_poc_lt[slice_idx][au4_ref_idx_col[1]] == LONG_TERM_REF);
+                                    (ps_mv_buf_col->ai1_l1_collocated_poc_lt[slice_idx][au4_ref_idx_col[1]] == LONG_TERM_REF);
                 }
 
                 /* L1 collocated mv */
diff --git a/decoder/ihevcd_nal.c b/decoder/ihevcd_nal.c
index cf2208f..bee399f 100644
--- a/decoder/ihevcd_nal.c
+++ b/decoder/ihevcd_nal.c
@@ -358,7 +358,7 @@
             break;
 
         case NAL_CRA         :
-            ps_codec->i4_rasl_output_flag = (0 == ps_codec->u4_pic_cnt) ? 0 : 1;
+            ps_codec->i4_rasl_output_flag = (0 != ps_codec->i4_cra_as_first_pic) ? 0 : 1;
             break;
 
         default:
@@ -393,6 +393,7 @@
             }
 
             ps_codec->i4_header_in_slice_mode = 0;
+            ps_codec->i4_cra_as_first_pic = 0;
 
             ret = ihevcd_parse_slice_header(ps_codec, &s_nal);
             DEBUG_PRINT_NAL_INFO(ps_codec, s_nal.i1_nal_unit_type);
@@ -425,6 +426,7 @@
                 sps_t *ps_sps = ps_codec->ps_sps_base + MAX_SPS_CNT - 1;
                 ihevcd_copy_sps(ps_codec, ps_sps->i1_sps_id, MAX_SPS_CNT - 1);
             }
+            ps_codec->i4_error_code = ret;
 
             DEBUG_PRINT_NAL_INFO(ps_codec, s_nal.i1_nal_unit_type);
             break;
@@ -444,10 +446,14 @@
                 pps_t *ps_pps = ps_codec->ps_pps_base + MAX_PPS_CNT - 1;
                 ihevcd_copy_pps(ps_codec, ps_pps->i1_pps_id, MAX_PPS_CNT - 1);
             }
-
+            ps_codec->i4_error_code = ret;
             DEBUG_PRINT_NAL_INFO(ps_codec, s_nal.i1_nal_unit_type);
             break;
 
+        case NAL_EOS        :
+            ps_codec->i4_cra_as_first_pic = 1;
+            break;
+
         default:
             DEBUG_PRINT_NAL_INFO(ps_codec, s_nal.i1_nal_unit_type);
             break;
diff --git a/decoder/ihevcd_parse_headers.c b/decoder/ihevcd_parse_headers.c
index 2f84d12..2b94131 100644
--- a/decoder/ihevcd_parse_headers.c
+++ b/decoder/ihevcd_parse_headers.c
@@ -1479,8 +1479,7 @@
                     (ps_codec->i4_ht != ps_sps->i2_pic_height_in_luma_samples)))
     {
         ps_codec->i4_reset_flag = 1;
-        ps_codec->i4_error_code = IVD_RES_CHANGED;
-        return (IHEVCD_ERROR_T)IHEVCD_FAIL;
+        return (IHEVCD_ERROR_T)IVD_RES_CHANGED;
     }
 
     /* Update display width and display height */
diff --git a/decoder/ihevcd_parse_slice.c b/decoder/ihevcd_parse_slice.c
index 89c90d9..3bb658e 100644
--- a/decoder/ihevcd_parse_slice.c
+++ b/decoder/ihevcd_parse_slice.c
@@ -2202,9 +2202,8 @@
     {
         if(!ps_slice_hdr->i1_dependent_slice_flag)
         {
-            ps_codec->s_parse.i4_cur_independent_slice_idx++;
-            if(MAX_SLICE_HDR_CNT == ps_codec->s_parse.i4_cur_independent_slice_idx)
-                ps_codec->s_parse.i4_cur_independent_slice_idx = 0;
+            ps_codec->s_parse.i4_cur_independent_slice_idx =
+                    ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1);
         }
     }
 
@@ -2281,8 +2280,8 @@
         {
             for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
             {
-                ps_mv_buf->l1_collocated_poc[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list1[i].pv_pic_buf)->i4_abs_poc;
-                ps_mv_buf->u1_l1_collocated_poc_lt[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list1[i].pv_pic_buf)->u1_used_as_ref;
+                ps_mv_buf->ai4_l1_collocated_poc[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list1[i].pv_pic_buf)->i4_abs_poc;
+                ps_mv_buf->ai1_l1_collocated_poc_lt[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list1[i].pv_pic_buf)->u1_used_as_ref;
             }
         }
 
@@ -2290,8 +2289,8 @@
         {
             for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
             {
-                ps_mv_buf->l0_collocated_poc[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list0[i].pv_pic_buf)->i4_abs_poc;
-                ps_mv_buf->u1_l0_collocated_poc_lt[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list0[i].pv_pic_buf)->u1_used_as_ref;
+                ps_mv_buf->ai4_l0_collocated_poc[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list0[i].pv_pic_buf)->i4_abs_poc;
+                ps_mv_buf->ai1_l0_collocated_poc_lt[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list0[i].pv_pic_buf)->u1_used_as_ref;
             }
         }
     }
@@ -2447,7 +2446,8 @@
                 }*/
             }
 
-            if(!ps_slice_hdr->i1_dependent_slice_flag)
+            /* Cabac init is done unconditionally at the start of the tile irrespective
+             * of whether it is a dependent or an independent slice */
             {
                 ihevcd_cabac_init(&ps_codec->s_parse.s_cabac,
                                   &ps_codec->s_parse.s_bitstrm,
@@ -2752,6 +2752,80 @@
             ps_codec->s_parse.pu1_tu_map += num_min4x4_in_ctb;
         }
 
+        /* QP array population has to be done if deblocking is enabled in the picture
+         * but some of the slices in the pic have it disabled.  NOTE(review): condition below tests (0 != i4_disable_deblk_pic) - confirm polarity */
+        if((0 != ps_codec->i4_disable_deblk_pic) &&
+                (1 == ps_slice_hdr->i1_slice_disable_deblocking_filter_flag))
+        {
+            bs_ctxt_t *ps_bs_ctxt = &ps_codec->s_parse.s_bs_ctxt;
+            WORD32 log2_ctb_size = ps_sps->i1_log2_ctb_size;
+            UWORD8 *pu1_qp;
+            WORD32 qp_strd;
+            WORD32 u4_qp_const_in_ctb;
+            WORD32 cur_ctb_idx;
+            WORD32 next_ctb_idx;
+            WORD32 cur_tu_idx;
+            WORD32 i4_ctb_tu_cnt;
+            tu_t *ps_tu;
+
+            cur_ctb_idx = ps_codec->s_parse.i4_ctb_x + ps_sps->i2_pic_wd_in_ctb * ps_codec->s_parse.i4_ctb_y;
+            /* ctb_size/8 elements per CTB */
+            qp_strd = ps_sps->i2_pic_wd_in_ctb << (log2_ctb_size - 3);
+            pu1_qp = ps_bs_ctxt->pu1_pic_qp + ((ps_codec->s_parse.i4_ctb_x + ps_codec->s_parse.i4_ctb_y * qp_strd) << (log2_ctb_size - 3));
+
+            u4_qp_const_in_ctb = ps_bs_ctxt->pu1_pic_qp_const_in_ctb[cur_ctb_idx >> 3] & (1 << (cur_ctb_idx & 7));
+
+            next_ctb_idx = ps_codec->s_parse.i4_next_tu_ctb_cnt;
+            if(1 == ps_codec->i4_num_cores)
+            {
+                i4_ctb_tu_cnt = ps_codec->s_parse.pu4_pic_tu_idx[next_ctb_idx] -
+                                ps_codec->s_parse.pu4_pic_tu_idx[cur_ctb_idx % RESET_TU_BUF_NCTB];
+
+                cur_tu_idx = ps_codec->s_parse.pu4_pic_tu_idx[cur_ctb_idx % RESET_TU_BUF_NCTB];
+            }
+            else
+            {
+                i4_ctb_tu_cnt = ps_codec->s_parse.pu4_pic_tu_idx[next_ctb_idx] -
+                                ps_codec->s_parse.pu4_pic_tu_idx[cur_ctb_idx];
+
+                cur_tu_idx = ps_codec->s_parse.pu4_pic_tu_idx[cur_ctb_idx];
+            }
+
+            ps_tu = &ps_codec->s_parse.ps_pic_tu[cur_tu_idx];
+
+            if(u4_qp_const_in_ctb)
+            {
+                pu1_qp[0] = ps_tu->b7_qp;
+            }
+            else
+            {
+                for(i = 0; i < i4_ctb_tu_cnt; i++, ps_tu++)
+                {
+                    WORD32 start_pos_x;
+                    WORD32 start_pos_y;
+                    WORD32 tu_size;
+
+                    /* start_pos_x and start_pos_y are in units of min TU size (4x4) */
+                    start_pos_x = ps_tu->b4_pos_x;
+                    start_pos_y = ps_tu->b4_pos_y;
+
+                    tu_size = 1 << (ps_tu->b3_size + 2);
+                    tu_size >>= 2; /* TU size divided by 4 */
+
+                    if(0 == (start_pos_x & 1) && 0 == (start_pos_y & 1))
+                    {
+                        WORD32 row, col;
+                        for(row = start_pos_y; row < start_pos_y + tu_size; row += 2)
+                        {
+                            for(col = start_pos_x; col < start_pos_x + tu_size; col += 2)
+                            {
+                                pu1_qp[(row >> 1) * qp_strd + (col >> 1)] = ps_tu->b7_qp;
+                            }
+                        }
+                    }
+                }
+            }
+        }
 
         if(ps_codec->i4_num_cores <= MV_PRED_NUM_CORES_THRESHOLD)
         {
@@ -2962,8 +3036,9 @@
              */
             if(0 == ps_codec->i4_disable_deblk_pic)
             {
-                if((0 == ps_slice_hdr->i1_slice_disable_deblocking_filter_flag) &&
-                                (0 == ps_codec->i4_slice_error))
+                /* Boundary strength calculation is done irrespective of whether deblocking is disabled
+                 * in the slice or not, to handle deblocking of slice boundaries */
+                if((0 == ps_codec->i4_slice_error))
                 {
                     WORD32 i4_ctb_tu_cnt;
                     WORD32 cur_ctb_idx, next_ctb_idx;
@@ -3020,7 +3095,9 @@
                         ihevcd_ctb_boundary_strength_pbslice(&ps_codec->s_parse.s_bs_ctxt);
                     }
                 }
-                else
+
+                /* Boundary strength is set to zero if deblocking is disabled for the current slice */
+                if(0 != ps_slice_hdr->i1_slice_disable_deblocking_filter_flag)
                 {
                     WORD32 bs_strd = (ps_sps->i2_pic_wd_in_ctb + 1) * (ctb_size * ctb_size / 8 / 16);
 
@@ -3031,9 +3108,8 @@
                                     ps_codec->s_parse.i4_ctb_x * (ctb_size * ctb_size / 8 / 16) +
                                     ps_codec->s_parse.i4_ctb_y * bs_strd);
 
-                    memset(pu4_vert_bs, 0, (ctb_size / 8 + 1) * (ctb_size / 4) / 8 * 2);
+                    memset(pu4_vert_bs, 0, (ctb_size / 8) * (ctb_size / 4) / 8 * 2);
                     memset(pu4_horz_bs, 0, (ctb_size / 8) * (ctb_size / 4) / 8 * 2);
-
                 }
             }
 
diff --git a/decoder/ihevcd_parse_slice_header.c b/decoder/ihevcd_parse_slice_header.c
index 661db13..2afa4d2 100644
--- a/decoder/ihevcd_parse_slice_header.c
+++ b/decoder/ihevcd_parse_slice_header.c
@@ -879,7 +879,7 @@
                     ps_mv_buf = (mv_buf_t *)ps_codec->ps_mv_buf;
                     for(i = 0; i < BUF_MGR_MAX_CNT; i++)
                     {
-                        if(ps_mv_buf->i4_abs_poc == ps_pic_buf->i4_abs_poc)
+                        if(ps_mv_buf && ps_mv_buf->i4_abs_poc == ps_pic_buf->i4_abs_poc)
                         {
                             ihevc_buf_mgr_release((buf_mgr_t *)ps_codec->pv_mv_buf_mgr, i, BUF_MGR_REF);
                             break;
@@ -907,11 +907,8 @@
         }
         else
         {
-            WORD32 ret;
-            ret = ihevcd_ref_list(ps_codec, ps_pps, ps_sps, ps_slice_hdr);
+            ihevcd_ref_list(ps_codec, ps_pps, ps_sps, ps_slice_hdr);
 
-            if(IHEVCD_REF_PIC_NOT_FOUND == ret)
-                return IHEVCD_IGNORE_SLICE;
         }
 
     }
diff --git a/decoder/ihevcd_process_slice.c b/decoder/ihevcd_process_slice.c
index 3794313..09b596a 100644
--- a/decoder/ihevcd_process_slice.c
+++ b/decoder/ihevcd_process_slice.c
@@ -787,8 +787,9 @@
                 {
                     if(ps_codec->i4_num_cores > MV_PRED_NUM_CORES_THRESHOLD)
                     {
-                        if((0 == ps_slice_hdr->i1_slice_disable_deblocking_filter_flag) &&
-                                        (0 == ps_codec->i4_slice_error))
+                        /* Boundary strength calculation is done irrespective of whether deblocking is disabled
+                         * in the slice or not, to handle deblocking of slice boundaries */
+                        if((0 == ps_codec->i4_slice_error))
                         {
                             ihevcd_update_ctb_tu_cnt(ps_proc);
                             ps_proc->s_bs_ctxt.ps_pps = ps_proc->ps_pps;
@@ -819,7 +820,9 @@
                                 ihevcd_ctb_boundary_strength_pbslice(&ps_proc->s_bs_ctxt);
                             }
                         }
-                        else
+
+                        /* Boundary strength is set to zero if deblocking is disabled for the current slice */
+                        if((0 != ps_slice_hdr->i1_slice_disable_deblocking_filter_flag))
                         {
                             WORD32 bs_strd = (ps_sps->i2_pic_wd_in_ctb + 1) * (ctb_size * ctb_size / 8 / 16);
 
@@ -830,9 +833,8 @@
                                             ps_proc->i4_ctb_x * (ctb_size * ctb_size / 8 / 16) +
                                             ps_proc->i4_ctb_y * bs_strd);
 
-                            memset(pu4_vert_bs, 0, (ctb_size / 8 + 1) * (ctb_size / 4) / 8 * 2);
+                            memset(pu4_vert_bs, 0, (ctb_size / 8) * (ctb_size / 4) / 8 * 2);
                             memset(pu4_horz_bs, 0, (ctb_size / 8) * (ctb_size / 4) / 8 * 2);
-
                         }
                     }
                 }
@@ -1002,13 +1004,13 @@
 
         while(num_ctb_tmp)
         {
-            slice_header_t *ps_slice_hdr = ps_proc->ps_slice_hdr;
+
 
             /* Check proc map to ensure dependencies for deblk are met */
             ihevcd_proc_map_check(ps_proc, proc_type, nctb);
 
             ihevcd_slice_hdr_update(ps_proc);
-            ps_slice_hdr = ps_proc->ps_slice_hdr;
+
 
             if(((0 == FRAME_ILF_PAD || ps_codec->i4_num_cores != 1)) &&
                (0 == ps_codec->i4_disable_deblk_pic))
@@ -1016,9 +1018,9 @@
                 WORD32 i4_is_last_ctb_x = 0;
                 WORD32 i4_is_last_ctb_y = 0;
 
-                if(0 == ps_slice_hdr->i1_slice_disable_deblocking_filter_flag ||
-                                (ps_proc->i4_ctb_slice_x == 0) ||
-                                (ps_proc->i4_ctb_slice_y == 0))
+
+                /* Deblocking is done irrespective of whether it is disabled in the slice or not,
+                 * to handle deblocking of the slice boundaries */
                 {
                     ps_proc->s_deblk_ctxt.ps_pps = ps_proc->ps_pps;
                     ps_proc->s_deblk_ctxt.ps_sps = ps_proc->ps_sps;
@@ -1111,18 +1113,20 @@
 
         while(num_ctb_tmp)
         {
-            slice_header_t *ps_slice_hdr = ps_proc->ps_slice_hdr;
+
 
             /* Check proc map to ensure dependencies for SAO are met */
             ihevcd_proc_map_check(ps_proc, proc_type, nctb);
 
             ihevcd_slice_hdr_update(ps_proc);
-            ps_slice_hdr = ps_proc->ps_slice_hdr;
+
 
             if(0 == FRAME_ILF_PAD || ps_codec->i4_num_cores != 1)
             {
-                if((0 == ps_codec->i4_disable_sao_pic) &&
-                                (ps_slice_hdr->i1_slice_sao_luma_flag || ps_slice_hdr->i1_slice_sao_chroma_flag))
+                /* SAO is done even when it is disabled in the current slice, because
+                 * it is performed on a shifted CTB and the neighbor CTBs can belong
+                 * to different slices with SAO enabled */
+                if(0 == ps_codec->i4_disable_sao_pic)
                 {
                     ps_proc->s_sao_ctxt.ps_pps = ps_proc->ps_pps;
                     ps_proc->s_sao_ctxt.ps_sps = ps_proc->ps_sps;
diff --git a/decoder/ihevcd_ref_list.c b/decoder/ihevcd_ref_list.c
index 976df97..76bb476 100644
--- a/decoder/ihevcd_ref_list.c
+++ b/decoder/ihevcd_ref_list.c
@@ -505,7 +505,7 @@
             ps_mv_buf = (mv_buf_t *)ps_codec->ps_mv_buf;
             for(i = 0; i < BUF_MGR_MAX_CNT; i++)
             {
-                if(ps_mv_buf->i4_abs_poc == ps_pic_buf->i4_abs_poc)
+                if(ps_mv_buf && ps_mv_buf->i4_abs_poc == ps_pic_buf->i4_abs_poc)
                 {
                     ihevc_buf_mgr_release((buf_mgr_t *)ps_codec->pv_mv_buf_mgr, i, BUF_MGR_REF);
                     break;
diff --git a/decoder/ihevcd_sao.c b/decoder/ihevcd_sao.c
index 8da9c0f..2702317 100644
--- a/decoder/ihevcd_sao.c
+++ b/decoder/ihevcd_sao.c
@@ -616,7 +616,9 @@
     {
         /* Check the loop filter flags and copy the original values for back up */
         /* Luma */
-        if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag)
+
+        /* Done unconditionally since SAO is done on a shifted CTB and the constituent CTBs
+         * can belong to different slices, each with its own sao_enable flag */
         {
             UWORD32 u4_no_loop_filter_flag;
             WORD32 loop_filter_bit_pos;
@@ -695,7 +697,7 @@
         }
 
         /* Chroma */
-        if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag)
+
         {
             UWORD32 u4_no_loop_filter_flag;
             WORD32 loop_filter_bit_pos;
@@ -789,6 +791,13 @@
         WORD32 ctby_tl_t = 0, ctby_tl_l = 0, ctby_tl_r = 0, ctby_tl_d = 0, ctby_tl = 0;
         WORD32 au4_idx_tl[8], idx_tl;
 
+        slice_header_t *ps_slice_hdr_top_left;
+        {
+            WORD32 top_left_ctb_indx = (ps_sao_ctxt->i4_ctb_y - 1) * ps_sps->i2_pic_wd_in_ctb +
+                                        (ps_sao_ctxt->i4_ctb_x - 1);
+            ps_slice_hdr_top_left = ps_slice_hdr_base + pu1_slice_idx[top_left_ctb_indx];
+        }
+
 
         pu1_src_luma -= (sao_wd_luma + sao_ht_luma * src_strd);
         pu1_src_chroma -= (sao_wd_chroma + sao_ht_chroma * src_strd);
@@ -798,7 +807,7 @@
         pu1_src_left_luma = ps_sao_ctxt->pu1_sao_src_left_luma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - sao_ht_luma;
         pu1_src_left_chroma = ps_sao_ctxt->pu1_sao_src_left_chroma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - (2 * sao_ht_chroma);
 
-        if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag)
+        if(ps_slice_hdr_top_left->i1_slice_sao_luma_flag)
         {
             if(0 == ps_sao->b3_y_type_idx)
             {
@@ -960,6 +969,11 @@
                             au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_tl[3])->i1_slice_loop_filter_across_slices_enabled_flag;
                             au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_tl[7])->i1_slice_loop_filter_across_slices_enabled_flag;
 
+                            if(au4_idx_tl[5] > idx_tl)
+                            {
+                                au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + au4_idx_tl[5])->i1_slice_loop_filter_across_slices_enabled_flag;
+                            }
+
                             /*
                              * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
                              * of the pixel having a greater address is checked. Accordingly, set the availability flags.
@@ -1093,8 +1107,19 @@
             }
 
         }
+        else if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+        {
+            /* Update left, top and top-left */
+            for(row = 0; row < sao_ht_luma; row++)
+            {
+                pu1_src_left_luma[row] = pu1_src_luma[row * src_strd + (sao_wd_luma - 1)];
+            }
+            pu1_sao_src_luma_top_left_ctb[0] = pu1_src_top_luma[sao_wd_luma - 1];
 
-        if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag)
+            ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_luma, &pu1_src_luma[(sao_ht_luma - 1) * src_strd], sao_wd_luma);
+        }
+
+        if(ps_slice_hdr_top_left->i1_slice_sao_chroma_flag)
         {
             if(0 == ps_sao->b3_cb_type_idx)
             {
@@ -1251,7 +1276,7 @@
                             else
                             {
                                 au4_ilf_across_tile_slice_enable[4] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
-                                au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + au4_idx_tl[5])->i1_slice_loop_filter_across_slices_enabled_flag;
                             }
                             au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
                             au4_ilf_across_tile_slice_enable[0] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
@@ -1415,6 +1440,18 @@
                 }
             }
         }
+        else if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+        {
+            for(row = 0; row < sao_ht_chroma; row++)
+            {
+                pu1_src_left_chroma[2 * row] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 2)];
+                pu1_src_left_chroma[2 * row + 1] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 1)];
+            }
+            pu1_sao_src_chroma_top_left_ctb[0] = pu1_src_top_chroma[sao_wd_chroma - 2];
+            pu1_sao_src_chroma_top_left_ctb[1] = pu1_src_top_chroma[sao_wd_chroma - 1];
+
+            ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_chroma, &pu1_src_chroma[(sao_ht_chroma - 1) * src_strd], sao_wd_chroma);
+        }
 
         pu1_src_luma += sao_wd_luma + sao_ht_luma * src_strd;
         pu1_src_chroma += sao_wd_chroma + sao_ht_chroma * src_strd;
@@ -1436,6 +1473,13 @@
 
         WORD32 remaining_cols;
 
+        slice_header_t *ps_slice_hdr_top;
+        {
+            WORD32 top_ctb_indx = (ps_sao_ctxt->i4_ctb_y - 1) * ps_sps->i2_pic_wd_in_ctb +
+                                        (ps_sao_ctxt->i4_ctb_x);
+            ps_slice_hdr_top = ps_slice_hdr_base + pu1_slice_idx[top_ctb_indx];
+        }
+
         remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + sao_wd_luma);
         if(remaining_cols <= SAO_SHIFT_CTB)
         {
@@ -1457,7 +1501,7 @@
 
         if(0 != sao_wd_luma)
         {
-            if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag)
+            if(ps_slice_hdr_top->i1_slice_sao_luma_flag)
             {
                 if(0 == ps_sao->b3_y_type_idx)
                 {
@@ -1583,6 +1627,12 @@
                                 au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_t[1])->i1_slice_loop_filter_across_slices_enabled_flag;
                                 au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_t[3])->i1_slice_loop_filter_across_slices_enabled_flag;
                                 au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_t[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+
+                                if(au4_idx_t[6] < idx_t)
+                                {
+                                    au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                }
+
                                 /*
                                  * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
                                  * of the pixel having a greater address is checked. Accordingly, set the availability flags
@@ -1703,11 +1753,22 @@
                     }
                 }
             }
+            else if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+            {
+                /* Update left, top and top-left */
+                for(row = 0; row < sao_ht_luma; row++)
+                {
+                    pu1_src_left_luma[row] = pu1_src_luma[row * src_strd + (sao_wd_luma - 1)];
+                }
+                pu1_sao_src_luma_top_left_ctb[0] = pu1_src_top_luma[sao_wd_luma - 1];
+
+                ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_luma, &pu1_src_luma[(sao_ht_luma - 1) * src_strd], sao_wd_luma);
+            }
         }
 
         if(0 != sao_wd_chroma)
         {
-            if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag)
+            if(ps_slice_hdr_top->i1_slice_sao_chroma_flag)
             {
                 if(0 == ps_sao->b3_cb_type_idx)
                 {
@@ -1837,11 +1898,17 @@
                                     au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + au4_idx_t[6])->i1_slice_loop_filter_across_slices_enabled_flag;
                                 }
 
-                                au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + au4_idx_t[5])->i1_slice_loop_filter_across_slices_enabled_flag;
                                 au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
                                 au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_t[1])->i1_slice_loop_filter_across_slices_enabled_flag;
                                 au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_t[3])->i1_slice_loop_filter_across_slices_enabled_flag;
                                 au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_t[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+
+                                if(idx_t > au4_idx_t[6])
+                                {
+                                    au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                }
+
                                 /*
                                  * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
                                  * of the pixel having a greater address is checked. Accordingly, set the availability flags
@@ -1981,6 +2048,18 @@
 
                 }
             }
+            else if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+            {
+                for(row = 0; row < sao_ht_chroma; row++)
+                {
+                    pu1_src_left_chroma[2 * row] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 2)];
+                    pu1_src_left_chroma[2 * row + 1] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 1)];
+                }
+                pu1_sao_src_chroma_top_left_ctb[0] = pu1_src_top_chroma[sao_wd_chroma - 2];
+                pu1_sao_src_chroma_top_left_ctb[1] = pu1_src_top_chroma[sao_wd_chroma - 1];
+
+                ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_chroma, &pu1_src_chroma[(sao_ht_chroma - 1) * src_strd], sao_wd_chroma);
+            }
         }
 
         pu1_src_luma += sao_ht_luma * src_strd;
@@ -2001,6 +2080,13 @@
         WORD32 au4_idx_l[8], idx_l;
 
         WORD32 remaining_rows;
+        slice_header_t *ps_slice_hdr_left;
+        {
+            WORD32 left_ctb_indx = (ps_sao_ctxt->i4_ctb_y) * ps_sps->i2_pic_wd_in_ctb +
+                                        (ps_sao_ctxt->i4_ctb_x - 1);
+            ps_slice_hdr_left = ps_slice_hdr_base + pu1_slice_idx[left_ctb_indx];
+        }
+
         remaining_rows = ps_sps->i2_pic_height_in_luma_samples - ((ps_sao_ctxt->i4_ctb_y << log2_ctb_size) + sao_ht_luma);
         if(remaining_rows <= SAO_SHIFT_CTB)
         {
@@ -2023,7 +2109,7 @@
 
         if(0 != sao_ht_luma)
         {
-            if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag)
+            if(ps_slice_hdr_left->i1_slice_sao_luma_flag)
             {
                 if(0 == ps_sao->b3_y_type_idx)
                 {
@@ -2143,6 +2229,12 @@
                                 au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_l[1])->i1_slice_loop_filter_across_slices_enabled_flag;
                                 au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_l[3])->i1_slice_loop_filter_across_slices_enabled_flag;
                                 au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_l[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+
+                                if(idx_l < au4_idx_l[5])
+                                {
+                                    au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + au4_idx_l[5])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                }
+
                                 /*
                                  * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
                                  * of the pixel having a greater address is checked. Accordingly, set the availability flags
@@ -2259,11 +2351,23 @@
 
                 }
             }
+            else if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+            {
+                /* Update left, top and top-left */
+                for(row = 0; row < sao_ht_luma; row++)
+                {
+                    pu1_src_left_luma[row] = pu1_src_luma[row * src_strd + (sao_wd_luma - 1)];
+                }
+                /*Update in next location*/
+                pu1_sao_src_top_left_luma_curr_ctb[0] = pu1_src_top_luma[sao_wd_luma - 1];
+
+                ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_luma, &pu1_src_luma[(sao_ht_luma - 1) * src_strd], sao_wd_luma);
+            }
         }
 
         if(0 != sao_ht_chroma)
         {
-            if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag)
+            if(ps_slice_hdr_left->i1_slice_sao_chroma_flag)
             {
                 if(0 == ps_sao->b3_cb_type_idx)
                 {
@@ -2390,6 +2494,12 @@
                                     au4_ilf_across_tile_slice_enable[2] =  (ps_slice_hdr_base + idx_l)->i1_slice_loop_filter_across_slices_enabled_flag;
                                     au4_ilf_across_tile_slice_enable[5] = au4_ilf_across_tile_slice_enable[4] = au4_ilf_across_tile_slice_enable[2];
                                 }
+
+                                if(au4_idx_l[5] > idx_l)
+                                {
+                                    au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + au4_idx_l[5])->i1_slice_loop_filter_across_slices_enabled_flag;
+                                }
+
                                 //  au4_ilf_across_tile_slice_enable[5] = au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_l)->i1_slice_loop_filter_across_slices_enabled_flag;
                                 au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_l[1])->i1_slice_loop_filter_across_slices_enabled_flag;
                                 au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_l[3])->i1_slice_loop_filter_across_slices_enabled_flag;
@@ -2539,6 +2649,18 @@
 
                 }
             }
+            else if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+            {
+                for(row = 0; row < sao_ht_chroma; row++)
+                {
+                    pu1_src_left_chroma[2 * row] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 2)];
+                    pu1_src_left_chroma[2 * row + 1] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 1)];
+                }
+                pu1_sao_src_top_left_chroma_curr_ctb[0] = pu1_src_top_chroma[sao_wd_chroma - 2];
+                pu1_sao_src_top_left_chroma_curr_ctb[1] = pu1_src_top_chroma[sao_wd_chroma - 1];
+
+                ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_chroma, &pu1_src_chroma[(sao_ht_chroma - 1) * src_strd], sao_wd_chroma);
+            }
 
         }
         pu1_src_luma += sao_wd_luma;
@@ -2729,6 +2851,11 @@
                                 au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_c[3])->i1_slice_loop_filter_across_slices_enabled_flag;
                                 au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_c[7])->i1_slice_loop_filter_across_slices_enabled_flag;
 
+                                if(au4_idx_c[6] < idx_c)
+                                {
+                                    au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + idx_c)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                }
+
                                 /*
                                  * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
                                  * of the pixel having a greater address is checked. Accordingly, set the availability flags
@@ -2858,6 +2985,19 @@
                     pu1_sao_src_top_left_luma_bot_left[0] = pu1_src_luma[(sao_ht_luma)*src_strd + sao_wd_luma - 1];
                 }
             }
+            else if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+            {
+                /* Update left, top and top-left */
+                for(row = 0; row < sao_ht_luma; row++)
+                {
+                    pu1_src_left_luma[row] = pu1_src_luma[row * src_strd + (sao_wd_luma - 1)];
+                }
+                pu1_sao_src_top_left_luma_curr_ctb[0] = pu1_src_top_luma[sao_wd_luma - 1];
+
+                ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_luma, &pu1_src_luma[(sao_ht_luma - 1) * src_strd], sao_wd_luma);
+
+                pu1_sao_src_top_left_luma_top_right[0] = pu1_src_luma[(sao_ht_luma - 1) * src_strd + sao_wd_luma];
+            }
         }
 
         if((0 != sao_wd_chroma) && (0 != sao_ht_chroma))
@@ -3017,6 +3157,11 @@
                                 au4_ilf_across_tile_slice_enable[3] &= (ps_slice_hdr_base + au4_idx_c[3])->i1_slice_loop_filter_across_slices_enabled_flag;
                                 au4_ilf_across_tile_slice_enable[7] &= (ps_slice_hdr_base + au4_idx_c[7])->i1_slice_loop_filter_across_slices_enabled_flag;
 
+                                if(idx_c > au4_idx_c[6])
+                                {
+                                    au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + idx_c)->i1_slice_loop_filter_across_slices_enabled_flag;
+                                }
+
                                 /*
                                  * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
                                  * of the pixel having a greater address is checked. Accordingly, set the availability flags
@@ -3172,6 +3317,21 @@
                 pu1_sao_src_top_left_chroma_bot_left[0] = pu1_src_chroma[(sao_ht_chroma)*src_strd + sao_wd_chroma - 2];
                 pu1_sao_src_top_left_chroma_bot_left[1] = pu1_src_chroma[(sao_ht_chroma)*src_strd + sao_wd_chroma - 1];
             }
+            else if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+            {
+                for(row = 0; row < sao_ht_chroma; row++)
+                {
+                    pu1_src_left_chroma[2 * row] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 2)];
+                    pu1_src_left_chroma[2 * row + 1] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 1)];
+                }
+                pu1_sao_src_top_left_chroma_curr_ctb[0] = pu1_src_top_chroma[sao_wd_chroma - 2];
+                pu1_sao_src_top_left_chroma_curr_ctb[1] = pu1_src_top_chroma[sao_wd_chroma - 1];
+
+                ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_chroma, &pu1_src_chroma[(sao_ht_chroma - 1) * src_strd], sao_wd_chroma);
+
+                pu1_sao_src_top_left_chroma_top_right[0] = pu1_src_chroma[(sao_ht_chroma - 1) * src_strd + sao_wd_chroma];
+                pu1_sao_src_top_left_chroma_top_right[1] = pu1_src_chroma[(sao_ht_chroma - 1) * src_strd + sao_wd_chroma + 1];
+            }
 
         }
     }
@@ -3182,7 +3342,7 @@
 /* If no loop filter is enabled copy the backed up values */
     {
         /* Luma */
-        if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag && no_loop_filter_enabled_luma)
+        if(no_loop_filter_enabled_luma)
         {
             UWORD32 u4_no_loop_filter_flag;
             WORD32 loop_filter_bit_pos;
@@ -3260,7 +3420,7 @@
         }
 
         /* Chroma */
-        if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag && no_loop_filter_enabled_chroma)
+        if(no_loop_filter_enabled_chroma)
         {
             UWORD32 u4_no_loop_filter_flag;
             WORD32 loop_filter_bit_pos;
diff --git a/decoder/ihevcd_structs.h b/decoder/ihevcd_structs.h
index e147521..1e6bc20 100644
--- a/decoder/ihevcd_structs.h
+++ b/decoder/ihevcd_structs.h
@@ -196,21 +196,21 @@
     /**
      * Absolute POCs of reference List 0 for all slices in the frame from which this frame is reconstructed
      */
-    WORD32 l0_collocated_poc[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
+    WORD32 ai4_l0_collocated_poc[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
 
     /**
      * Flag to indicate Long Term reference for POCs of reference List 0 for all slices in the frame from which this frame is reconstructed
      */
-    WORD8 u1_l0_collocated_poc_lt[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
+    WORD8 ai1_l0_collocated_poc_lt[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
 
     /**
      * Absolute POCs of reference List 1 for all slices in the frame from which this frame is reconstructed
      */
-    WORD32 l1_collocated_poc[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
+    WORD32 ai4_l1_collocated_poc[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
     /**
      * Flag to indicate Long Term reference for POCs of reference List 1 for all slices in the frame from which this frame is reconstructed
      */
-    WORD32 u1_l1_collocated_poc_lt[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
+    WORD8 ai1_l1_collocated_poc_lt[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
 
 }mv_buf_t;
 
@@ -1713,6 +1713,12 @@
     WORD32 i4_rasl_output_flag;
 
     /**
+     * This flag is set if the next picture received is a CRA and has to be treated as the first picture in the video sequence.
+     * For example, it is set if an EOS (end of stream) NAL is received.
+     */
+    WORD32 i4_cra_as_first_pic;
+
+    /**
      * Pictures that are are degraded
      * 0 : No degrade
      * 1 : Only on non-reference frames
@@ -1975,6 +1981,15 @@
      */
     WORD32 i4_total_pic_buf_size;
 
+    /**
+     * Remaining pic buffer size - used for shared mode with 420p support
+     */
+    WORD32 i4_remaining_pic_buf_size;
+
+    /**
+     * Current chroma buffer base - used for shared mode with 420p output
+     */
+    UWORD8 *pu1_cur_chroma_ref_buf;
 
     /**
      * Picture buffer manager
@@ -2139,6 +2154,12 @@
     IVD_ARCH_T e_processor_arch;
     /**  Processor soc */
     IVD_SOC_T e_processor_soc;
+
+    /** Display buffer array - for shared mode */
+    ivd_out_bufdesc_t s_disp_buffer[IVD_VIDDEC_MAX_IO_BUFFERS];
+
+    /** Number of active display buffers - for shared mode */
+    WORD32  i4_share_disp_buf_cnt;
 };
 
 #endif /* _IHEVCD_STRUCTS_H_ */
diff --git a/decoder/ihevcd_utils.c b/decoder/ihevcd_utils.c
index c46a4e6..5c10014 100644
--- a/decoder/ihevcd_utils.c
+++ b/decoder/ihevcd_utils.c
@@ -246,13 +246,14 @@
                                      WORD32 level,
                                      WORD32 horz_pad,
                                      WORD32 vert_pad,
-                                     WORD32 num_ref_frames,
-                                     WORD32 num_reorder_frames)
+                                     WORD32 init_num_bufs,
+                                     WORD32 init_extra_bufs,
+                                     WORD32 chroma_only)
 {
     WORD32 size;
     WORD32 num_luma_samples;
     WORD32 lvl_idx;
-    WORD32 max_wd;
+    WORD32 max_wd, min_ht;
     WORD32 max_dpb_size;
     WORD32 num_samples;
     WORD32 max_num_bufs;
@@ -267,20 +268,29 @@
     /* If num_ref_frames and num_reorder_frmaes is specified
      * Use minimum value
      */
-    max_num_bufs = MIN(max_num_bufs, (num_ref_frames + num_reorder_frames + 1));
+    max_num_bufs = MIN(max_num_bufs, init_num_bufs);
+
+    /*
+     * Add extra buffers if required
+     */
+    max_num_bufs += init_extra_bufs;
+    max_num_bufs = MIN(max_num_bufs, BUF_MGR_MAX_CNT);
 
     /* Get level index */
     lvl_idx = ihevcd_get_lvl_idx(level);
 
-    /* Maximum number of luma samples in a picture at given level */
-    num_luma_samples = gai4_ihevc_max_luma_pic_size[lvl_idx];
-
-    /* Account for chroma */
-    num_samples = num_luma_samples * 3 / 2;
 
     /* Maximum width of luma samples in a picture at given level */
-    max_wd = gai4_ihevc_max_wd_ht[lvl_idx];
+    max_wd = ALIGN64(gai4_ihevc_max_wd_ht[lvl_idx]);
 
+    /* Minimum height of luma samples in a picture at given level */
+    min_ht = ALIGN64(gai4_ihevc_min_wd_ht[lvl_idx]);
+
+    /* Use max_wd and min_ht to get the maximum number of luma samples for the given level */
+    /* Because max_wd and min_ht are aligned to 64, the product can be higher than the
+     * value given by the spec for a given level
+     */
+    num_luma_samples = max_wd * min_ht;
 
     /* Allocation is required for
      * (Wd + horz_pad) * (Ht + vert_pad) * (2 * max_dpb_size + 1)
@@ -299,11 +309,19 @@
      * So use max_wd and min_ht
      */
 
+    /* Account for padding area */
+
+    num_luma_samples += (pad * pad) + pad * (max_wd + min_ht);
+
+    /* Account for chroma */
+    if(0 == chroma_only)
+        num_samples = num_luma_samples * 3 / 2;
+    else
+        num_samples = num_luma_samples / 2;
+
     /* Number of bytes in reference pictures */
     size = num_samples * max_num_bufs;
 
-    /* Account for padding area */
-    size += ((pad * pad) + pad * (max_wd + max_wd)) * max_num_bufs;
 
     return size;
 }
@@ -354,7 +372,6 @@
     /* Size for storing pu_t for each PU */
     mv_bank_size += num_pu * sizeof(pu_t);
 
-
     size =  mv_bank_size;
     return size;
 }
@@ -683,9 +700,9 @@
     pu1_buf = (UWORD8 *)ps_codec->pv_mv_bank_buf_base;
 
     ps_mv_buf = (mv_buf_t *)pu1_buf;
-    pu1_buf += BUF_MGR_MAX_CNT  * sizeof(mv_buf_t);
+    pu1_buf += (MAX_DPB_SIZE + 1) * sizeof(mv_buf_t);
     ps_codec->ps_mv_buf = ps_mv_buf;
-    mv_bank_size_allocated = ps_codec->i4_total_mv_bank_size - BUF_MGR_MAX_CNT * sizeof(mv_buf_t);
+    mv_bank_size_allocated = ps_codec->i4_total_mv_bank_size - (MAX_DPB_SIZE + 1) * sizeof(mv_buf_t);
 
     /* Compute MV bank size per picture */
     pic_mv_bank_size = ihevcd_get_pic_mv_bank_size(ps_sps->i2_pic_width_in_luma_samples *
diff --git a/decoder/ihevcd_utils.h b/decoder/ihevcd_utils.h
index c2cbcc4..7282ae7 100644
--- a/decoder/ihevcd_utils.h
+++ b/decoder/ihevcd_utils.h
@@ -55,6 +55,7 @@
                                      WORD32 level,
                                      WORD32 horz_pad,
                                      WORD32 vert_pad,
-                                     WORD32 num_ref_frames,
-                                     WORD32 num_reorder_frames);
+                                     WORD32 init_num_bufs,
+                                     WORD32 init_extra_bufs,
+                                     WORD32 chroma_only);
 #endif /* _IHEVCD_UTILS_H_ */
diff --git a/decoder/ihevcd_version.c b/decoder/ihevcd_version.c
index a47c6fc..b1b6e21 100644
--- a/decoder/ihevcd_version.c
+++ b/decoder/ihevcd_version.c
@@ -62,7 +62,7 @@
  * Version string. First two digits signify major version and last two minor
  * Increment major version for API change or major feature update
  */
-#define CODEC_RELEASE_VER       "04.01"
+#define CODEC_RELEASE_VER       "04.04"
 /**
  * Vendor name
  */
diff --git a/test/decoder.mk b/test/decoder.mk
index ef560b3..6c64ac3 100644
--- a/test/decoder.mk
+++ b/test/decoder.mk
@@ -5,7 +5,7 @@
 LOCAL_MODULE    := hevcdec
 LOCAL_MODULE_TAGS := optional
 
-LOCAL_CFLAGS := -DPROFILE_ENABLE -DARM  -fPIC
+LOCAL_CFLAGS := -DPROFILE_ENABLE -DARM  -fPIC -DMD5_DISABLE
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/../decoder $(LOCAL_PATH)/../common $(LOCAL_PATH)/
 LOCAL_SRC_FILES := decoder/main.c
 LOCAL_STATIC_LIBRARIES := libhevcdec
diff --git a/test/decoder/main.c b/test/decoder/main.c
index 5c2d438..f102add 100644
--- a/test/decoder/main.c
+++ b/test/decoder/main.c
@@ -57,7 +57,7 @@
 #include "ithread.h"
 
 
-#define MD5_DISABLE
+//#define MD5_DISABLE
 #ifdef X86_MSVC
 #include <windows.h>
 #else
@@ -1781,10 +1781,6 @@
 #endif
     WORD32 width = 0, height = 0;
     iv_obj_t *codec_obj;
-#if defined(GPU_BUILD) && !defined(X86)
-//    int ioctl_init();
-//    ioctl_init();
-#endif
 
 #ifdef X86_MINGW
     //For getting printfs without any delay