libhevc: Fix ASan issue in intra pred mode assembly function

The chroma intra pred 3_to_9 mode av8 assembly loads 16 bytes before the
base address of the intermediate buffer but doesn't use them. Allocate
this additional size and offset the buffer pointer by the same amount to
avoid an access violation.

Test: hevcenc -c vid_enc_cfg.txt --input selfie_1280_720.yuv --src_width 512 --src_height 512

Bug: 144595488

Change-Id: I1094cf4593ba379354f567876288e2d9b9fae83d
diff --git a/encoder/ihevce_enc_loop_pass.c b/encoder/ihevce_enc_loop_pass.c
index f1cb79c..4943c5c 100644
--- a/encoder/ihevce_enc_loop_pass.c
+++ b/encoder/ihevce_enc_loop_pass.c
@@ -141,6 +141,8 @@
 /* Constant Macros                                                           */
 /*****************************************************************************/
 #define UPDATE_QP_AT_CTB 6
+#define INTRAPRED_SIMD_LEFT_PADDING 16
+#define INTRAPRED_SIMD_RIGHT_PADDING 8
 
 /*****************************************************************************/
 /* Function Definitions                                                      */
@@ -3853,8 +3855,12 @@
     ps_mem_tab[ENC_LOOP_CHROMA_PRED_INTRA].i4_mem_alignment = 8;
 
     /* Memory required to store pred for reference substitution output */
+    /* While (MAX_TU_SIZE * 2 * 2) + 1 is the actual size needed,
+       allocate 16 bytes to the left and 7 bytes to the right to facilitate
+       SIMD access */
     ps_mem_tab[ENC_LOOP_REF_SUB_OUT].i4_mem_size =
-        i4_num_proc_thrds * ((MAX_TU_SIZE * 2 * 2) + 4) *
+        i4_num_proc_thrds * (((MAX_TU_SIZE * 2 * 2) + INTRAPRED_SIMD_RIGHT_PADDING)
+        + INTRAPRED_SIMD_LEFT_PADDING)*
         ((ps_init_prms->s_tgt_lyr_prms.i4_internal_bit_depth > 8) ? 2 : 1) * sizeof(UWORD8);
 
     ps_mem_tab[ENC_LOOP_REF_SUB_OUT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
@@ -3862,8 +3868,12 @@
     ps_mem_tab[ENC_LOOP_REF_SUB_OUT].i4_mem_alignment = 8;
 
     /* Memory required to store pred for reference filtering output */
+    /* While (MAX_TU_SIZE * 2 * 2) + 1 is the actual size needed,
+       allocate 16 bytes to the left and 7 bytes to the right to facilitate
+       SIMD access */
     ps_mem_tab[ENC_LOOP_REF_FILT_OUT].i4_mem_size =
-        i4_num_proc_thrds * ((MAX_TU_SIZE * 2 * 2) + 4) *
+        i4_num_proc_thrds * (((MAX_TU_SIZE * 2 * 2) + INTRAPRED_SIMD_RIGHT_PADDING)
+        + INTRAPRED_SIMD_LEFT_PADDING)*
         ((ps_init_prms->s_tgt_lyr_prms.i4_internal_bit_depth > 8) ? 2 : 1) * sizeof(UWORD8);
 
     ps_mem_tab[ENC_LOOP_REF_FILT_OUT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
@@ -4777,22 +4787,24 @@
 
             /* Memory assignments for reference substitution output */
             {
-                WORD32 pred_buf_size = ((MAX_TU_SIZE * 2 * 2) + 4);
+                WORD32 pred_buf_size = ((MAX_TU_SIZE * 2 * 2) + INTRAPRED_SIMD_RIGHT_PADDING
+                                       + INTRAPRED_SIMD_LEFT_PADDING);
                 WORD32 pred_buf_size_per_thread = pred_buf_size;
                 UWORD8 *pu1_base = (UWORD8 *)ps_mem_tab[ENC_LOOP_REF_SUB_OUT].pv_base +
                                    (ctr * pred_buf_size_per_thread);
 
-                ps_ctxt->pv_ref_sub_out = pu1_base;
+                ps_ctxt->pv_ref_sub_out = pu1_base + INTRAPRED_SIMD_LEFT_PADDING;
             }
 
             /* Memory assignments for reference filtering output */
             {
-                WORD32 pred_buf_size = ((MAX_TU_SIZE * 2 * 2) + 4);
+                WORD32 pred_buf_size = ((MAX_TU_SIZE * 2 * 2) + INTRAPRED_SIMD_RIGHT_PADDING
+                                       + INTRAPRED_SIMD_LEFT_PADDING);
                 WORD32 pred_buf_size_per_thread = pred_buf_size;
                 UWORD8 *pu1_base = (UWORD8 *)ps_mem_tab[ENC_LOOP_REF_FILT_OUT].pv_base +
                                    (ctr * pred_buf_size_per_thread);
 
-                ps_ctxt->pv_ref_filt_out = pu1_base;
+                ps_ctxt->pv_ref_filt_out = pu1_base + INTRAPRED_SIMD_LEFT_PADDING;
             }
 
             /* Memory assignments for recon storage during CU Recursion */