/******************************************************************************
*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* ihevc_quant_iquant_ssd.c
*
* @brief
* Contains function definitions for quantization, followed by Inverse
* quantization to find transform domain SSD
*
* @author
* 100453, 100578
*
* @par List of Functions:
* - ihevc_quant_iquant_ssd()
* - ihevc_quant_iquant()
* - ihevc_quant_iquant_ssd_rdoq()
* - ihevc_quant_iquant_rdoq()
* - ihevc_quant_iquant_ssd_flat_scale_mat()
* - ihevc_quant_iquant_flat_scale_mat()
* - ihevc_quant_iquant_ssd_flat_scale_mat_rdoq()
* - ihevc_quant_iquant_flat_scale_mat_rdoq()
* - ihevc_q_iq_ssd_var_rnd_fact()
* - ihevc_q_iq_var_rnd_fact()
* - ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact()
* - ihevc_q_iq_flat_scale_mat_var_rnd_fact()
*
* @remarks
* None
*
*******************************************************************************
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_defs.h"
#include "ihevc_debug.h"
#include "ihevc_trans_tables.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_func_selector.h"
#include "ihevc_trans_macros.h"
/*****************************************************************************/
/* Globals */
/*****************************************************************************/
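/* g_ihevc_quant_scales[] and g_ihevc_iquant_scales[], used throughout this
 * file, are declared in ihevc_trans_tables.h and indexed by qp_rem (QP % 6).
 * For reference, they are expected to hold the standard HEVC scale values
 * (an assumption based on the HM reference tables, not re-verified here):
 *
 *   g_ihevc_quant_scales  : { 26214, 23302, 20560, 18396, 16384, 14564 }
 *   g_ihevc_iquant_scales : {    40,    45,    51,    57,    64,    72 }
 */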
/**
*******************************************************************************
*
* @brief
* This function performs quantization, followed by inverse quantization,
* to find the transform-domain SSD
*
* @par Description:
* Performs quantization followed by inverse quantization on the input
* coefficient block and accumulates the transform-domain SSD
*
* @param[in] pi2_coeffs
* Input transform coefficients (trans_size x trans_size block)
*
* @param[in] pi2_quant_coeff
* Scaling matrix for quantization
*
* @param[out] pi2_q_dst
* Output quantized coefficients
*
* @param[out] pi2_iq_dst
* Output inverse-quantized coefficients
*
* @param[in] trans_size
* Transform block size
*
* @param[in] qp_div
* Quantization parameter / 6
*
* @param[in] qp_rem
* Quantization parameter % 6
*
* @param[in] q_add
* Quantization rounding factor (ignored by the _var_rnd_fact variants)
*
* @param[in] pi4_quant_round_factor_0_1
* Per-coefficient rounding factors for the 0-vs-1 level decision
* (used only by the _var_rnd_fact variants)
*
* @param[in] pi4_quant_round_factor_1_2
* Per-coefficient rounding factors for the 1-vs-2 level decision
* (used only by the _var_rnd_fact variants)
*
* @param[in] src_strd
* Input stride
*
* @param[in] dst_q_strd
* Output stride for quantized coefficients
*
* @param[in] dst_iq_strd
* Output stride for inverse-quantized coefficients
*
* @param[out] csbf
* Coded sub-block flags
*
* @param[in] csbf_strd
* Coded sub-block flag buffer stride
*
* @param[out] zero_col
* Zero column flags
*
* @param[out] zero_row
* Zero row flags
*
* @param[in] pi2_dequant_coeff
* Scaling matrix for inverse quantization
*
* @param[out] pi8_cost
* Transform-domain SSD (written only by the _ssd variants)
*
* @returns cbf
* coded block flag
*
* @remarks
* None
*
*******************************************************************************
*/
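/* A minimal sketch of the per-coefficient round trip performed below,
 * assuming the QUANT and IQUANT macros from ihevc_trans_macros.h implement
 * the usual HEVC forward and inverse scaling (illustrative only, not the
 * exact macro bodies):
 *
 *   level = sign(c) * ((abs(c) * w * scale + round) >> q_bits);
 *   recon = CLIP_S16((((level * w_inv * iscale) << qp_div)
 *                     + (1 << (shift_iq - 1))) >> shift_iq);
 *   ssd  += (c - recon) * (c - recon);
 *
 * where c is the input coefficient, w and w_inv are the forward and inverse
 * scaling-matrix entries, and scale/iscale are the qp_rem-indexed entries of
 * the global tables above.
 */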
WORD32 ihevc_quant_iquant_ssd
(
WORD16 *pi2_coeffs,
WORD16 *pi2_quant_coeff,
WORD16 *pi2_q_dst,
WORD16 *pi2_iq_dst,
WORD32 trans_size,
WORD32 qp_div,/* qpscaled / 6 */
WORD32 qp_rem,/* qpscaled % 6 */
WORD32 q_add,
WORD32 *pi4_quant_round_factor_0_1,
WORD32 *pi4_quant_round_factor_1_2,
WORD32 src_strd,
WORD32 dst_q_strd,
WORD32 dst_iq_strd,
UWORD8 *csbf,
WORD32 csbf_strd,
WORD32 *zero_col,
WORD32 *zero_row,
WORD16 *pi2_dequant_coeff,
LWORD64 *pi8_cost
)
{
WORD32 i, j;
WORD32 log2_size;
WORD16 *pi2_q_dst_orig;
WORD32 cbf = 0;
WORD32 bit_depth,shift_iq;
WORD32 val;
WORD16 i2_temp;
WORD32 ssd_cost = 0;
(void)pi4_quant_round_factor_0_1;
(void)pi4_quant_round_factor_1_2;
pi2_q_dst_orig = pi2_q_dst;
/* Quant initialization */
GETRANGE(log2_size, trans_size);
log2_size -= 1;
bit_depth = 8 + 0;
shift_iq = bit_depth + log2_size - 5;
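/* shift_iq matches the HEVC dequantization shift, BitDepth + log2(nTbS) - 5,
   with bit_depth fixed at 8 here; GETRANGE is assumed to return
   1 + floor(log2(trans_size)), hence the -1 correction above. */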
for(i = 0; i < trans_size; i++)
{
for(j = 0; j < trans_size; j++)
{
/* Back up the coefficients before Quantization */
i2_temp = pi2_coeffs[j];
/* Quantization */
QUANT(pi2_q_dst[j], pi2_coeffs[j],
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, q_add);
/* Inverse Quantization */
IQUANT(pi2_iq_dst[j],
pi2_q_dst[j], /*pi2_src[index*src_strd]*/
pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
/*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
shift_iq,
qp_div);
/* SSD Computation & Accumulation */
val = i2_temp - pi2_iq_dst[j];
ssd_cost += val*val;
}
pi2_q_dst += dst_q_strd;
pi2_iq_dst += dst_iq_strd;
pi2_quant_coeff += trans_size;
pi2_coeffs += src_strd;
pi2_dequant_coeff += trans_size;
}
/* Store the cost */
*pi8_cost = ssd_cost;
/* CSBF update */
{
WORD32 block_row, block_col;
WORD32 row, col;
WORD16 *pi2_block;
UWORD32 temp_zero_col = 0;
UWORD32 temp_zero_row = 0;
pi2_q_dst = pi2_q_dst_orig;
for(block_row = 0; block_row < trans_size; block_row += 4)
{
// block_col increments by 1 (rather than 4) so that csbf can be indexed directly
for(block_col = 0; block_col < trans_size / 4; block_col++)
{
pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
*(csbf + block_col) = 0;
for(row = 0; row < 4; row++)
{
for(col = 0; col < 4; col++)
{
if(pi2_block[row * dst_q_strd + col] != 0)
{
*(csbf + block_col) = 1;
break;
}
}
if(*(csbf + block_col) == 1)
{
/* zero_col update: temp_zero_col = ~zero_col */
temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
// zero_col can be optimized further: for now, all 4 bits
// corresponding to the 4 columns of the 4x4 block are cleared
// whenever the 4x4 csbf is set
/* zero_row update: temp_zero_row = ~zero_row */
temp_zero_row = (temp_zero_row) | (0xFU << block_row);
// zero_row can be optimized further: for now, all 4 bits
// corresponding to the 4 rows of the 4x4 block are cleared
// whenever the 4x4 csbf is set
break;
}
}
cbf = cbf || (*(csbf + block_col)); // cbf update
}
csbf += csbf_strd;
}
*zero_col = ~temp_zero_col; //final zero_col storing
*zero_row = ~temp_zero_row; //final zero_row storing
}
return cbf;
}
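/* Usage note (a sketch, not taken from this file): temp_zero_col marks the
 * columns that contain a non-zero level and is inverted before being stored,
 * so a set bit n in *zero_col means column n of the quantized block is
 * entirely zero. A consumer such as the inverse transform could skip work
 * accordingly:
 *
 *   if((zero_col >> n) & 1)
 *   {
 *       // column n carries no non-zero level and may be skipped
 *   }
 */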
/**
*******************************************************************************
*
* @brief
* This function performs quantization, followed by inverse quantization
*
* @par Description:
* Performs quantization followed by inverse quantization on the input
* coefficient block
*
* @param[in] pi2_coeffs
* Input transform coefficients (trans_size x trans_size block)
*
* @param[in] pi2_quant_coeff
* Scaling matrix for quantization
*
* @param[out] pi2_q_dst
* Output quantized coefficients
*
* @param[out] pi2_iq_dst
* Output inverse-quantized coefficients
*
* @param[in] trans_size
* Transform block size
*
* @param[in] qp_div
* Quantization parameter / 6
*
* @param[in] qp_rem
* Quantization parameter % 6
*
* @param[in] q_add
* Quantization rounding factor (ignored by the _var_rnd_fact variants)
*
* @param[in] pi4_quant_round_factor_0_1
* Per-coefficient rounding factors for the 0-vs-1 level decision
* (used only by the _var_rnd_fact variants)
*
* @param[in] pi4_quant_round_factor_1_2
* Per-coefficient rounding factors for the 1-vs-2 level decision
* (used only by the _var_rnd_fact variants)
*
* @param[in] src_strd
* Input stride
*
* @param[in] dst_q_strd
* Output stride for quantized coefficients
*
* @param[in] dst_iq_strd
* Output stride for inverse-quantized coefficients
*
* @param[out] csbf
* Coded sub-block flags
*
* @param[in] csbf_strd
* Coded sub-block flag buffer stride
*
* @param[out] zero_col
* Zero column flags
*
* @param[out] zero_row
* Zero row flags
*
* @param[in] pi2_dequant_coeff
* Scaling matrix for inverse quantization
*
* @param[out] pi8_cost
* Transform-domain SSD (written only by the _ssd variants)
*
* @returns cbf
* coded block flag
*
* @remarks
* None
*
*******************************************************************************
*/
WORD32 ihevc_quant_iquant
(
WORD16 *pi2_coeffs,
WORD16 *pi2_quant_coeff,
WORD16 *pi2_q_dst,
WORD16 *pi2_iq_dst,
WORD32 trans_size,
WORD32 qp_div,/* qpscaled / 6 */
WORD32 qp_rem,/* qpscaled % 6 */
WORD32 q_add,
WORD32 *pi4_quant_round_factor_0_1,
WORD32 *pi4_quant_round_factor_1_2,
WORD32 src_strd,
WORD32 dst_q_strd,
WORD32 dst_iq_strd,
UWORD8 *csbf,
WORD32 csbf_strd,
WORD32 *zero_col,
WORD32 *zero_row,
WORD16 *pi2_dequant_coeff,
LWORD64 *pi8_cost
)
{
WORD32 i, j;
WORD32 log2_size;
WORD16 *pi2_q_dst_orig;
WORD32 cbf = 0;
WORD32 bit_depth,shift_iq;
WORD16 i2_temp;
(void)pi8_cost;
(void)pi4_quant_round_factor_0_1;
(void)pi4_quant_round_factor_1_2;
pi2_q_dst_orig = pi2_q_dst;
/* Quant initialization */
GETRANGE(log2_size, trans_size);
log2_size -= 1;
bit_depth = 8;
shift_iq = bit_depth + log2_size - 5;
for(i = 0; i < trans_size; i++)
{
for(j = 0; j < trans_size; j++)
{
/* Back up the coefficients before Quantization */
i2_temp = pi2_coeffs[j];
/* Quantization */
QUANT(pi2_q_dst[j], pi2_coeffs[j],
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, q_add);
/* Inverse Quantization */
IQUANT(pi2_iq_dst[j],
pi2_q_dst[j], /*pi2_src[index*src_strd]*/
pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
shift_iq,
qp_div);
}
pi2_q_dst += dst_q_strd;
pi2_iq_dst += dst_iq_strd;
pi2_quant_coeff += trans_size;
pi2_coeffs += src_strd;
pi2_dequant_coeff += trans_size;
}
/* CSBF update */
{
WORD32 block_row, block_col;
WORD32 row, col;
WORD16 *pi2_block;
UWORD32 temp_zero_col = 0;
UWORD32 temp_zero_row = 0;
pi2_q_dst = pi2_q_dst_orig;
for(block_row = 0; block_row < trans_size; block_row += 4)
{
// block_col increments by 1 (rather than 4) so that csbf can be indexed directly
for(block_col = 0; block_col < trans_size / 4; block_col++)
{
pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
*(csbf + block_col) = 0;
for(row = 0; row < 4; row++)
{
for(col = 0; col < 4; col++)
{
if(pi2_block[row * dst_q_strd + col] != 0)
{
*(csbf + block_col) = 1;
break;
}
}
if(*(csbf + block_col) == 1)
{
/* zero_col update: temp_zero_col = ~zero_col */
temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
// zero_col can be optimized further: for now, all 4 bits
// corresponding to the 4 columns of the 4x4 block are cleared
// whenever the 4x4 csbf is set
/* zero_row update: temp_zero_row = ~zero_row */
temp_zero_row = (temp_zero_row) | (0xFU << block_row);
// zero_row can be optimized further: for now, all 4 bits
// corresponding to the 4 rows of the 4x4 block are cleared
// whenever the 4x4 csbf is set
break;
}
}
cbf = cbf || (*(csbf + block_col)); // cbf update
}
csbf += csbf_strd;
}
*zero_col = ~temp_zero_col; //final zero_col storing
*zero_row = ~temp_zero_row; //final zero_row storing
}
return cbf;
}
/**
*******************************************************************************
*
* @brief
* This function performs quantization with RDOQ-style re-quantization,
* followed by inverse quantization to find the transform-domain SSD
*
* @par Description:
* Performs two-pass (RDOQ-style) quantization followed by inverse
* quantization and accumulates the transform-domain SSD
*
* @param[in] pi2_coeffs
* Input transform coefficients (trans_size x trans_size block)
*
* @param[in] pi2_quant_coeff
* Scaling matrix for quantization
*
* @param[out] pi2_q_dst
* Output quantized coefficients
*
* @param[out] pi2_iq_dst
* Output inverse-quantized coefficients
*
* @param[in] trans_size
* Transform block size
*
* @param[in] qp_div
* Quantization parameter / 6
*
* @param[in] qp_rem
* Quantization parameter % 6
*
* @param[in] q_add
* Quantization rounding factor (ignored by the _var_rnd_fact variants)
*
* @param[in] pi4_quant_round_factor_0_1
* Per-coefficient rounding factors for the 0-vs-1 level decision
* (used only by the _var_rnd_fact variants)
*
* @param[in] pi4_quant_round_factor_1_2
* Per-coefficient rounding factors for the 1-vs-2 level decision
* (used only by the _var_rnd_fact variants)
*
* @param[in] src_strd
* Input stride
*
* @param[in] dst_q_strd
* Output stride for quantized coefficients
*
* @param[in] dst_iq_strd
* Output stride for inverse-quantized coefficients
*
* @param[out] csbf
* Coded sub-block flags
*
* @param[in] csbf_strd
* Coded sub-block flag buffer stride
*
* @param[out] zero_col
* Zero column flags
*
* @param[out] zero_row
* Zero row flags
*
* @param[in] pi2_dequant_coeff
* Scaling matrix for inverse quantization
*
* @param[out] pi8_cost
* Transform-domain SSD (written only by the _ssd variants)
*
* @returns cbf
* coded block flag
*
* @remarks
* None
*
*******************************************************************************
*/
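/* The _rdoq variants quantize each coefficient twice: a first pass with the
 * caller-supplied rounding factor q_add decides the level, and any
 * coefficient whose first-pass level exceeds 1 in magnitude is re-quantized
 * from the backed-up input with a half-LSB rounding factor,
 * (1 << QUANT_ROUND_FACTOR_Q) / 2. Levels of 0 and +/-1 keep the original
 * (typically smaller) rounding, which leaves them cheaper for a downstream
 * RDOQ decision to prune.
 */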
WORD32 ihevc_quant_iquant_ssd_rdoq
(
WORD16 *pi2_coeffs,
WORD16 *pi2_quant_coeff,
WORD16 *pi2_q_dst,
WORD16 *pi2_iq_dst,
WORD32 trans_size,
WORD32 qp_div,/* qpscaled / 6 */
WORD32 qp_rem,/* qpscaled % 6 */
WORD32 q_add,
WORD32 *pi4_quant_round_factor_0_1,
WORD32 *pi4_quant_round_factor_1_2,
WORD32 src_strd,
WORD32 dst_q_strd,
WORD32 dst_iq_strd,
UWORD8 *csbf,
WORD32 csbf_strd,
WORD32 *zero_col,
WORD32 *zero_row,
WORD16 *pi2_dequant_coeff,
LWORD64 *pi8_cost
)
{
WORD32 i, j;
WORD32 log2_size;
WORD16 *pi2_q_dst_orig;
WORD32 cbf = 0;
WORD32 bit_depth,shift_iq;
WORD32 val;
WORD16 i2_temp;
WORD32 ssd_cost = 0;
(void)pi4_quant_round_factor_0_1;
(void)pi4_quant_round_factor_1_2;
pi2_q_dst_orig = pi2_q_dst;
GETRANGE(log2_size, trans_size);
log2_size -= 1;
bit_depth = 8 + 0;
shift_iq = bit_depth + log2_size - 5;
for(i = 0; i < trans_size; i++)
{
for(j = 0; j < trans_size; j++)
{
/* Back up the coefficients before Quantization */
i2_temp = pi2_coeffs[j];
/* Quantization */
QUANT(pi2_q_dst[j], pi2_coeffs[j],
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, q_add);
if (abs(pi2_q_dst[j]) > 1)
{
QUANT(pi2_q_dst[j],i2_temp,
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
}
/* Inverse Quantization */
IQUANT(pi2_iq_dst[j],
pi2_q_dst[j], /*pi2_src[index*src_strd]*/
pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
/*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
shift_iq,
qp_div);
/* SSD Computation & Accumulation */
val = i2_temp - pi2_iq_dst[j];
ssd_cost += val*val;
}
pi2_q_dst += dst_q_strd;
pi2_iq_dst += dst_iq_strd;
pi2_quant_coeff += trans_size;
pi2_coeffs += src_strd;
pi2_dequant_coeff += trans_size;
}
/* Store the cost */
*pi8_cost = ssd_cost;
/* CSBF update */
{
WORD32 block_row, block_col;
WORD32 row, col;
WORD16 *pi2_block;
UWORD32 temp_zero_col = 0;
UWORD32 temp_zero_row = 0;
pi2_q_dst = pi2_q_dst_orig;
for(block_row = 0; block_row < trans_size; block_row += 4)
{
// block_col increments by 1 (rather than 4) so that csbf can be indexed directly
for(block_col = 0; block_col < trans_size / 4; block_col++)
{
pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
*(csbf + block_col) = 0;
for(row = 0; row < 4; row++)
{
for(col = 0; col < 4; col++)
{
if(pi2_block[row * dst_q_strd + col] != 0)
{
*(csbf + block_col) = 1;
break;
}
}
if(*(csbf + block_col) == 1)
{
/* zero_col update: temp_zero_col = ~zero_col */
temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
// zero_col can be optimized further: for now, all 4 bits
// corresponding to the 4 columns of the 4x4 block are cleared
// whenever the 4x4 csbf is set
/* zero_row update: temp_zero_row = ~zero_row */
temp_zero_row = (temp_zero_row) | (0xFU << block_row);
// zero_row can be optimized further: for now, all 4 bits
// corresponding to the 4 rows of the 4x4 block are cleared
// whenever the 4x4 csbf is set
break;
}
}
cbf = cbf || (*(csbf + block_col)); // cbf update
}
csbf += csbf_strd;
}
*zero_col = ~temp_zero_col; //final zero_col storing
*zero_row = ~temp_zero_row; //final zero_row storing
}
return cbf;
}
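/**
 * Variant of ihevc_quant_iquant_ssd_rdoq() that skips the transform-domain
 * SSD computation; pi8_cost is unused.
 */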
WORD32 ihevc_quant_iquant_rdoq
(
WORD16 *pi2_coeffs,
WORD16 *pi2_quant_coeff,
WORD16 *pi2_q_dst,
WORD16 *pi2_iq_dst,
WORD32 trans_size,
WORD32 qp_div,/* qpscaled / 6 */
WORD32 qp_rem,/* qpscaled % 6 */
WORD32 q_add,
WORD32 *pi4_quant_round_factor_0_1,
WORD32 *pi4_quant_round_factor_1_2,
WORD32 src_strd,
WORD32 dst_q_strd,
WORD32 dst_iq_strd,
UWORD8 *csbf,
WORD32 csbf_strd,
WORD32 *zero_col,
WORD32 *zero_row,
WORD16 *pi2_dequant_coeff,
LWORD64 *pi8_cost
)
{
WORD32 i, j;
WORD32 log2_size;
WORD16 *pi2_q_dst_orig;
WORD32 cbf = 0;
WORD32 bit_depth,shift_iq;
WORD16 i2_temp;
(void)pi8_cost;
(void)pi4_quant_round_factor_0_1;
(void)pi4_quant_round_factor_1_2;
pi2_q_dst_orig = pi2_q_dst;
GETRANGE(log2_size, trans_size);
log2_size -= 1;
bit_depth = 8 + 0;
shift_iq = bit_depth + log2_size - 5;
for(i = 0; i < trans_size; i++)
{
for(j = 0; j < trans_size; j++)
{
/* Back up the coefficients before Quantization */
i2_temp = pi2_coeffs[j];
/* Quantization */
QUANT(pi2_q_dst[j], pi2_coeffs[j],
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, q_add);
if (abs(pi2_q_dst[j]) > 1)
{
QUANT(pi2_q_dst[j],i2_temp,
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
}
/* Inverse Quantization */
IQUANT(pi2_iq_dst[j],
pi2_q_dst[j], /*pi2_src[index*src_strd]*/
pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
shift_iq,
qp_div);
}
pi2_q_dst += dst_q_strd;
pi2_iq_dst += dst_iq_strd;
pi2_quant_coeff += trans_size;
pi2_coeffs += src_strd;
pi2_dequant_coeff += trans_size;
}
/* CSBF update */
{
WORD32 block_row, block_col;
WORD32 row, col;
WORD16 *pi2_block;
UWORD32 temp_zero_col = 0;
UWORD32 temp_zero_row = 0;
pi2_q_dst = pi2_q_dst_orig;
for(block_row = 0; block_row < trans_size; block_row += 4)
{
// block_col increments by 1 (rather than 4) so that csbf can be indexed directly
for(block_col = 0; block_col < trans_size / 4; block_col++)
{
pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
*(csbf + block_col) = 0;
for(row = 0; row < 4; row++)
{
for(col = 0; col < 4; col++)
{
if(pi2_block[row * dst_q_strd + col] != 0)
{
*(csbf + block_col) = 1;
break;
}
}
if(*(csbf + block_col) == 1)
{
/* zero_col update: temp_zero_col = ~zero_col */
temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
// zero_col can be optimized further: for now, all 4 bits
// corresponding to the 4 columns of the 4x4 block are cleared
// whenever the 4x4 csbf is set
/* zero_row update: temp_zero_row = ~zero_row */
temp_zero_row = (temp_zero_row) | (0xFU << block_row);
// zero_row can be optimized further: for now, all 4 bits
// corresponding to the 4 rows of the 4x4 block are cleared
// whenever the 4x4 csbf is set
break;
}
}
cbf = cbf || (*(csbf + block_col)); // cbf update
}
csbf += csbf_strd;
}
*zero_col = ~temp_zero_col; //final zero_col storing
*zero_row = ~temp_zero_row; //final zero_row storing
}
return cbf;
}
/**
*******************************************************************************
*
* @brief
* This function performs quantization (using a flat scale matrix), followed
* by inverse quantization to find the transform-domain SSD
*
* @par Description:
* Performs flat-scale quantization followed by inverse quantization and
* accumulates the transform-domain SSD
*
* @param[in] pi2_coeffs
* Input transform coefficients (trans_size x trans_size block)
*
* @param[in] pi2_quant_coeff
* Scaling matrix for quantization
*
* @param[out] pi2_q_dst
* Output quantized coefficients
*
* @param[out] pi2_iq_dst
* Output inverse-quantized coefficients
*
* @param[in] trans_size
* Transform block size
*
* @param[in] qp_div
* Quantization parameter / 6
*
* @param[in] qp_rem
* Quantization parameter % 6
*
* @param[in] q_add
* Quantization rounding factor (ignored by the _var_rnd_fact variants)
*
* @param[in] pi4_quant_round_factor_0_1
* Per-coefficient rounding factors for the 0-vs-1 level decision
* (used only by the _var_rnd_fact variants)
*
* @param[in] pi4_quant_round_factor_1_2
* Per-coefficient rounding factors for the 1-vs-2 level decision
* (used only by the _var_rnd_fact variants)
*
* @param[in] src_strd
* Input stride
*
* @param[in] dst_q_strd
* Output stride for quantized coefficients
*
* @param[in] dst_iq_strd
* Output stride for inverse-quantized coefficients
*
* @param[out] csbf
* Coded sub-block flags
*
* @param[in] csbf_strd
* Coded sub-block flag buffer stride
*
* @param[out] zero_col
* Zero column flags
*
* @param[out] zero_row
* Zero row flags
*
* @param[in] pi2_dequant_coeff
* Scaling matrix for inverse quantization
*
* @param[out] pi8_cost
* Transform-domain SSD (written only by the _ssd variants)
*
* @returns cbf
* coded block flag
*
* @remarks
* None
*
*******************************************************************************
*/
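/* In the flat-scale-matrix variants the scaling list is uniform, so
 * QUANT_NO_WEIGHTMAT is assumed to fold the constant weight into q_bits
 * instead of multiplying by pi2_quant_coeff[j] per coefficient (see the
 * reference expansion kept as a comment in
 * ihevc_quant_iquant_ssd_flat_scale_mat_rdoq below). These variants also
 * short-circuit the inverse path: a level of 0 always dequantizes to 0, so
 * IQUANT is skipped for zero levels.
 */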
WORD32 ihevc_quant_iquant_ssd_flat_scale_mat
(
WORD16 *pi2_coeffs,
WORD16 *pi2_quant_coeff,
WORD16 *pi2_q_dst,
WORD16 *pi2_iq_dst,
WORD32 trans_size,
WORD32 qp_div,/* qpscaled / 6 */
WORD32 qp_rem,/* qpscaled % 6 */
WORD32 q_add,
WORD32 *pi4_quant_round_factor_0_1,
WORD32 *pi4_quant_round_factor_1_2,
WORD32 src_strd,
WORD32 dst_q_strd,
WORD32 dst_iq_strd,
UWORD8 *csbf,
WORD32 csbf_strd,
WORD32 *zero_col,
WORD32 *zero_row,
WORD16 *pi2_dequant_coeff,
LWORD64 *pi8_cost
)
{
WORD32 i, j;
WORD32 log2_size;
WORD16 *pi2_q_dst_orig;
WORD32 cbf = 0;
WORD32 bit_depth,shift_iq;
WORD32 val;
WORD16 i2_temp;
/* Initialize cost to zero */
WORD32 ssd_cost = 0;
(void)pi4_quant_round_factor_0_1;
(void)pi4_quant_round_factor_1_2;
pi2_q_dst_orig = pi2_q_dst;
/* Quant initialization */
GETRANGE(log2_size, trans_size);
log2_size -= 1;
bit_depth = 8 + 0;
shift_iq = bit_depth + log2_size - 5;
for(i = 0; i < trans_size; i++)
{
for(j = 0; j < trans_size; j++)
{
/* Back up the coefficients before Quantization */
i2_temp = pi2_coeffs[j];
/* Quantization */
QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, q_add);
if(pi2_q_dst[j] == 0)
{
pi2_iq_dst[j] = 0;
}
else
{
/* Inverse Quantization */
IQUANT(pi2_iq_dst[j],
pi2_q_dst[j], /*pi2_src[index*src_strd]*/
pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
shift_iq,
qp_div);
}
/* SSD Computation & Accumulation */
val = i2_temp - pi2_iq_dst[j];
ssd_cost += val*val;
}
pi2_q_dst += dst_q_strd;
pi2_iq_dst += dst_iq_strd;
pi2_quant_coeff += trans_size;
pi2_coeffs += src_strd;
pi2_dequant_coeff += trans_size;
}
/* Store the cost */
*pi8_cost = ssd_cost;
/* CSBF update */
{
WORD32 block_row, block_col;
WORD32 row, col;
WORD16 *pi2_block;
UWORD32 temp_zero_col = 0;
UWORD32 temp_zero_row = 0;
pi2_q_dst = pi2_q_dst_orig;
for(block_row = 0; block_row < trans_size; block_row += 4)
{
// block_col increments by 1 (rather than 4) so that csbf can be indexed directly
for(block_col = 0; block_col < trans_size / 4; block_col++)
{
pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
*(csbf + block_col) = 0;
for(row = 0; row < 4; row++)
{
for(col = 0; col < 4; col++)
{
if(pi2_block[row * dst_q_strd + col] != 0)
{
*(csbf + block_col) = 1;
break;
}
}
if(*(csbf + block_col) == 1)
{
/* zero_col update: temp_zero_col = ~zero_col */
temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
// zero_col can be optimized further: for now, all 4 bits
// corresponding to the 4 columns of the 4x4 block are cleared
// whenever the 4x4 csbf is set
/* zero_row update: temp_zero_row = ~zero_row */
temp_zero_row = (temp_zero_row) | (0xFU << block_row);
// zero_row can be optimized further: for now, all 4 bits
// corresponding to the 4 rows of the 4x4 block are cleared
// whenever the 4x4 csbf is set
break;
}
}
cbf = cbf || (*(csbf + block_col)); // cbf update
}
csbf += csbf_strd;
}
*zero_col = ~temp_zero_col; //final zero_col storing
*zero_row = ~temp_zero_row; //final zero_row storing
}
return cbf;
}
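/**
 * Variant of ihevc_quant_iquant_ssd_flat_scale_mat() that skips the
 * transform-domain SSD computation; pi8_cost is unused.
 */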
WORD32 ihevc_quant_iquant_flat_scale_mat
(
WORD16 *pi2_coeffs,
WORD16 *pi2_quant_coeff,
WORD16 *pi2_q_dst,
WORD16 *pi2_iq_dst,
WORD32 trans_size,
WORD32 qp_div,/* qpscaled / 6 */
WORD32 qp_rem,/* qpscaled % 6 */
WORD32 q_add,
WORD32 *pi4_quant_round_factor_0_1,
WORD32 *pi4_quant_round_factor_1_2,
WORD32 src_strd,
WORD32 dst_q_strd,
WORD32 dst_iq_strd,
UWORD8 *csbf,
WORD32 csbf_strd,
WORD32 *zero_col,
WORD32 *zero_row,
WORD16 *pi2_dequant_coeff,
LWORD64 *pi8_cost
)
{
WORD32 i, j;
WORD32 log2_size;
WORD16 *pi2_q_dst_orig;
WORD32 cbf = 0;
WORD32 bit_depth,shift_iq;
WORD16 i2_temp;
(void)pi8_cost;
(void)pi4_quant_round_factor_0_1;
(void)pi4_quant_round_factor_1_2;
pi2_q_dst_orig = pi2_q_dst;
/* Quant initialization */
GETRANGE(log2_size, trans_size);
log2_size -= 1;
bit_depth = 8 + 0;
shift_iq = bit_depth + log2_size - 5;
for(i = 0; i < trans_size; i++)
{
for(j = 0; j < trans_size; j++)
{
/* Back up the coefficients before Quantization */
i2_temp = pi2_coeffs[j];
/* Quantization */
QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, q_add);
if(pi2_q_dst[j] == 0)
{
pi2_iq_dst[j] = 0;
}
else
{
/* Inverse Quantization */
IQUANT(pi2_iq_dst[j],
pi2_q_dst[j], /*pi2_src[index*src_strd]*/
pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
shift_iq,
qp_div);
}
}
pi2_q_dst += dst_q_strd;
pi2_iq_dst += dst_iq_strd;
pi2_quant_coeff += trans_size;
pi2_coeffs += src_strd;
pi2_dequant_coeff += trans_size;
}
/* CSBF update */
{
WORD32 block_row, block_col;
WORD32 row, col;
WORD16 *pi2_block;
UWORD32 temp_zero_col = 0;
UWORD32 temp_zero_row = 0;
pi2_q_dst = pi2_q_dst_orig;
for(block_row = 0; block_row < trans_size; block_row += 4)
{
// block_col increments by 1 (rather than 4) so that csbf can be indexed directly
for(block_col = 0; block_col < trans_size / 4; block_col++)
{
pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
*(csbf + block_col) = 0;
for(row = 0; row < 4; row++)
{
for(col = 0; col < 4; col++)
{
if(pi2_block[row * dst_q_strd + col] != 0)
{
*(csbf + block_col) = 1;
break;
}
}
if(*(csbf + block_col) == 1)
{
/* zero_col update: temp_zero_col = ~zero_col */
temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
// zero_col can be optimized further: for now, all 4 bits
// corresponding to the 4 columns of the 4x4 block are cleared
// whenever the 4x4 csbf is set
/* zero_row update: temp_zero_row = ~zero_row */
temp_zero_row = (temp_zero_row) | (0xFU << block_row);
// zero_row can be optimized further: for now, all 4 bits
// corresponding to the 4 rows of the 4x4 block are cleared
// whenever the 4x4 csbf is set
break;
}
}
cbf = cbf || (*(csbf + block_col)); // cbf update
}
csbf += csbf_strd;
}
*zero_col = ~temp_zero_col; //final zero_col storing
*zero_row = ~temp_zero_row; //final zero_row storing
}
return cbf;
}
/**
*******************************************************************************
*
* @brief
* This function performs quantization (using a flat scale matrix), followed
* by inverse quantization to find the transform-domain SSD, as used in RDOQ.
* When the quantized value turns out to be greater than 1 in magnitude, the
* coefficient is re-quantized using half rounding.
*
* @par Description:
* Performs flat-scale, two-pass (RDOQ-style) quantization followed by
* inverse quantization and accumulates the transform-domain SSD
*
* @param[in] pi2_coeffs
* Input transform coefficients (trans_size x trans_size block)
*
* @param[in] pi2_quant_coeff
* Scaling matrix for quantization
*
* @param[out] pi2_q_dst
* Output quantized coefficients
*
* @param[out] pi2_iq_dst
* Output inverse-quantized coefficients
*
* @param[in] trans_size
* Transform block size
*
* @param[in] qp_div
* Quantization parameter / 6
*
* @param[in] qp_rem
* Quantization parameter % 6
*
* @param[in] q_add
* Quantization rounding factor (ignored by the _var_rnd_fact variants)
*
* @param[in] pi4_quant_round_factor_0_1
* Per-coefficient rounding factors for the 0-vs-1 level decision
* (used only by the _var_rnd_fact variants)
*
* @param[in] pi4_quant_round_factor_1_2
* Per-coefficient rounding factors for the 1-vs-2 level decision
* (used only by the _var_rnd_fact variants)
*
* @param[in] src_strd
* Input stride
*
* @param[in] dst_q_strd
* Output stride for quantized coefficients
*
* @param[in] dst_iq_strd
* Output stride for inverse-quantized coefficients
*
* @param[out] csbf
* Coded sub-block flags
*
* @param[in] csbf_strd
* Coded sub-block flag buffer stride
*
* @param[out] zero_col
* Zero column flags
*
* @param[out] zero_row
* Zero row flags
*
* @param[in] pi2_dequant_coeff
* Scaling matrix for inverse quantization
*
* @param[out] pi8_cost
* Transform-domain SSD (written only by the _ssd variants)
*
* @returns cbf
* coded block flag
*
* @remarks
* None
*
*******************************************************************************
*/
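/* The ASSERTs in this variant document two properties of the two-pass
 * rounding (assuming the caller's q_add rounding never exceeds half an LSB):
 * the first-pass and re-quantized levels differ by at most 1, and the
 * half-rounded level is never smaller in magnitude. For example, an ideal
 * level of 2.55 with a 1/3-LSB first-pass rounding gives 2 (2.55 + 0.33
 * truncates to 2), while the half-rounded second pass gives 3 (2.55 + 0.5
 * truncates to 3).
 */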
WORD32 ihevc_quant_iquant_ssd_flat_scale_mat_rdoq
(
WORD16 *pi2_coeffs,
WORD16 *pi2_quant_coeff,
WORD16 *pi2_q_dst,
WORD16 *pi2_iq_dst,
WORD32 trans_size,
WORD32 qp_div,/* qpscaled / 6 */
WORD32 qp_rem,/* qpscaled % 6 */
WORD32 q_add,
WORD32 *pi4_quant_round_factor_0_1,
WORD32 *pi4_quant_round_factor_1_2,
WORD32 src_strd,
WORD32 dst_q_strd,
WORD32 dst_iq_strd,
UWORD8 *csbf,
WORD32 csbf_strd,
WORD32 *zero_col,
WORD32 *zero_row,
WORD16 *pi2_dequant_coeff,
LWORD64 *pi8_cost
)
{
WORD32 i, j;
WORD32 log2_size;
WORD16 *pi2_q_dst_orig;
WORD32 cbf = 0;
WORD32 bit_depth,shift_iq;
WORD32 val;
WORD16 i2_temp;
/* Initialize cost to zero */
WORD32 ssd_cost = 0;
(void)pi4_quant_round_factor_0_1;
(void)pi4_quant_round_factor_1_2;
pi2_q_dst_orig = pi2_q_dst;
/* Quant initialization */
GETRANGE(log2_size, trans_size);
log2_size -= 1;
bit_depth = 8 + 0;
shift_iq = bit_depth + log2_size - 5;
for(i = 0; i < trans_size; i++)
{
for(j = 0; j < trans_size; j++)
{
WORD16 i2_temp1;
/* Back up the coefficients before Quantization */
i2_temp = pi2_coeffs[j];
/* Quantization */
QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, q_add);
/* Reference expansion of QUANT_NO_WEIGHTMAT (previously dead code here,
 * under an if(1)/else pair; kept as documentation):
 *
 *   sign = (inp < 0) ? -1 : 1;
 *   bit_depth = 8;
 *   transform_shift = MAX_TR_DYNAMIC_RANGE - bit_depth - log2_trans_size;
 *   quant_multiplier = 4;  // quant_coeff values are pre-scaled by 16
 *   q_bits = QUANT_SHIFT + qp_div + transform_shift + SCALING_Q_SHIFT
 *            - quant_multiplier - FLAT_RESCALE_MAT_Q_SHIFT;  // flat mat Q = 2048
 *   tmp = (WORD32)abs(inp);
 *   tmp = tmp * quant_coeff;
 *   tmp = tmp + (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q));
 *   tmp = tmp >> q_bits;
 *   out = (WORD16)CLIP_S16(tmp * sign);
 */
i2_temp1 = pi2_q_dst[j];
if (abs(pi2_q_dst[j]) > 1)
{
QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
}
ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
ASSERT(abs(i2_temp1) <= abs(pi2_q_dst[j]));
/* Inverse Quantization */
IQUANT(pi2_iq_dst[j],
pi2_q_dst[j], /*pi2_src[index*src_strd]*/
pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
shift_iq,
qp_div);
/* SSD Computation & Accumulation */
val = i2_temp - pi2_iq_dst[j];
ssd_cost += val*val;
}
pi2_q_dst += dst_q_strd;
pi2_iq_dst += dst_iq_strd;
pi2_quant_coeff += trans_size;
pi2_coeffs += src_strd;
pi2_dequant_coeff += trans_size;
}
/* Store the cost */
*pi8_cost = ssd_cost;
/* CSBF update */
{
WORD32 block_row, block_col;
WORD32 row, col;
WORD16 *pi2_block;
UWORD32 temp_zero_col = 0;
UWORD32 temp_zero_row = 0;
pi2_q_dst = pi2_q_dst_orig;
for(block_row = 0; block_row < trans_size; block_row += 4)
{
// block_col increments by 1 (rather than 4) so that csbf can be indexed directly
for(block_col = 0; block_col < trans_size / 4; block_col++)
{
pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
*(csbf + block_col) = 0;
for(row = 0; row < 4; row++)
{
for(col = 0; col < 4; col++)
{
if(pi2_block[row * dst_q_strd + col] != 0)
{
*(csbf + block_col) = 1;
break;
}
}
if(*(csbf + block_col) == 1)
{
/* zero_col update: temp_zero_col = ~zero_col */
temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
// zero_col can be optimized further: for now, all 4 bits
// corresponding to the 4 columns of the 4x4 block are cleared
// whenever the 4x4 csbf is set
/* zero_row update: temp_zero_row = ~zero_row */
temp_zero_row = (temp_zero_row) | (0xFU << block_row);
// zero_row can be optimized further: for now, all 4 bits
// corresponding to the 4 rows of the 4x4 block are cleared
// whenever the 4x4 csbf is set
break;
}
}
cbf = cbf || (*(csbf + block_col)); // cbf update
}
csbf += csbf_strd;
}
*zero_col = ~temp_zero_col; //final zero_col storing
*zero_row = ~temp_zero_row; //final zero_row storing
}
return cbf;
}
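/**
 * Variant of ihevc_quant_iquant_ssd_flat_scale_mat_rdoq() that skips the
 * transform-domain SSD computation; pi8_cost is unused.
 */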
WORD32 ihevc_quant_iquant_flat_scale_mat_rdoq
(
WORD16 *pi2_coeffs,
WORD16 *pi2_quant_coeff,
WORD16 *pi2_q_dst,
WORD16 *pi2_iq_dst,
WORD32 trans_size,
WORD32 qp_div,/* qpscaled / 6 */
WORD32 qp_rem,/* qpscaled % 6 */
WORD32 q_add,
WORD32 *pi4_quant_round_factor_0_1,
WORD32 *pi4_quant_round_factor_1_2,
WORD32 src_strd,
WORD32 dst_q_strd,
WORD32 dst_iq_strd,
UWORD8 *csbf,
WORD32 csbf_strd,
WORD32 *zero_col,
WORD32 *zero_row,
WORD16 *pi2_dequant_coeff,
LWORD64 *pi8_cost
)
{
WORD32 i, j;
WORD32 log2_size;
WORD16 *pi2_q_dst_orig;
WORD32 cbf = 0;
WORD32 bit_depth,shift_iq;
WORD16 i2_temp;
(void)pi8_cost;
(void)pi4_quant_round_factor_0_1;
(void)pi4_quant_round_factor_1_2;
pi2_q_dst_orig = pi2_q_dst;
/* Quant initialization */
GETRANGE(log2_size, trans_size);
log2_size -= 1;
bit_depth = 8 + 0;
shift_iq = bit_depth + log2_size - 5;
for(i = 0; i < trans_size; i++)
{
for(j = 0; j < trans_size; j++)
{
WORD16 i2_temp1;
/* Back up the coefficients before Quantization */
i2_temp = pi2_coeffs[j];
QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, q_add);
i2_temp1 = pi2_q_dst[j];
if (abs(pi2_q_dst[j]) > 1)
{
QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
}
ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
ASSERT(abs(i2_temp1) <= abs(pi2_q_dst[j]));
IQUANT(pi2_iq_dst[j],
pi2_q_dst[j], /*pi2_src[index*src_strd]*/
pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
shift_iq,
qp_div);
}
pi2_q_dst += dst_q_strd;
pi2_iq_dst += dst_iq_strd;
pi2_quant_coeff += trans_size;
pi2_coeffs += src_strd;
pi2_dequant_coeff += trans_size;
}
/* CSBF update */
{
WORD32 block_row, block_col;
WORD32 row, col;
WORD16 *pi2_block;
UWORD32 temp_zero_col = 0;
UWORD32 temp_zero_row = 0;
pi2_q_dst = pi2_q_dst_orig;
for(block_row = 0; block_row < trans_size; block_row += 4)
{
// block_col increments by 1 (rather than 4) so that csbf can be indexed directly
for(block_col = 0; block_col < trans_size / 4; block_col++)
{
pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
*(csbf + block_col) = 0;
for(row = 0; row < 4; row++)
{
for(col = 0; col < 4; col++)
{
if(pi2_block[row * dst_q_strd + col] != 0)
{
*(csbf + block_col) = 1;
break;
}
}
if(*(csbf + block_col) == 1)
{
/* zero_col update: temp_zero_col = ~zero_col */
temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
// zero_col can be optimized further: for now, all 4 bits
// corresponding to the 4 columns of the 4x4 block are cleared
// whenever the 4x4 csbf is set
/* zero_row update: temp_zero_row = ~zero_row */
temp_zero_row = (temp_zero_row) | (0xFU << block_row);
// zero_row can be optimized further: for now, all 4 bits
// corresponding to the 4 rows of the 4x4 block are cleared
// whenever the 4x4 csbf is set
break;
}
}
cbf = cbf || (*(csbf + block_col)); // cbf update
}
csbf += csbf_strd;
}
*zero_col = ~temp_zero_col; //final zero_col storing
*zero_row = ~temp_zero_row; //final zero_row storing
}
return cbf;
}
/**
*******************************************************************************
*
* @brief
* This function performs quantization with per-coefficient variable
* rounding factors, followed by inverse quantization to find the
* transform-domain SSD
*
* @par Description:
* Performs quantization with per-coefficient rounding factors, followed by
* inverse quantization, and accumulates the transform-domain SSD
*
* @param[in] pi2_coeffs
* Input transform coefficients (trans_size x trans_size block)
*
* @param[in] pi2_quant_coeff
* Scaling matrix for quantization
*
* @param[out] pi2_q_dst
* Output quantized coefficients
*
* @param[out] pi2_iq_dst
* Output inverse-quantized coefficients
*
* @param[in] trans_size
* Transform block size
*
* @param[in] qp_div
* Quantization parameter / 6
*
* @param[in] qp_rem
* Quantization parameter % 6
*
* @param[in] q_add
* Quantization rounding factor (ignored by the _var_rnd_fact variants)
*
* @param[in] pi4_quant_round_factor_0_1
* Per-coefficient rounding factors for the 0-vs-1 level decision
* (used only by the _var_rnd_fact variants)
*
* @param[in] pi4_quant_round_factor_1_2
* Per-coefficient rounding factors for the 1-vs-2 level decision
* (used only by the _var_rnd_fact variants)
*
* @param[in] src_strd
* Input stride
*
* @param[in] dst_q_strd
* Output stride for quantized coefficients
*
* @param[in] dst_iq_strd
* Output stride for inverse-quantized coefficients
*
* @param[out] csbf
* Coded sub-block flags
*
* @param[in] csbf_strd
* Coded sub-block flag buffer stride
*
* @param[out] zero_col
* Zero column flags
*
* @param[out] zero_row
* Zero row flags
*
* @param[in] pi2_dequant_coeff
* Scaling matrix for inverse quantization
*
* @param[out] pi8_cost
* Transform-domain SSD (written only by the _ssd variants)
*
* @returns cbf
* coded block flag
*
* @remarks
* None
*
*******************************************************************************
*/
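/* The _var_rnd_fact variants pick the rounding factor per coefficient from
 * the quantization bin that a zero-rounding probe lands in:
 *
 *   probe level >= 2 : half rounding, (1 << QUANT_ROUND_FACTOR_Q) / 2
 *   probe level == 1 : *pi4_quant_round_factor_1_2
 *   probe level == 0 : *pi4_quant_round_factor_0_1
 *
 * Both factor pointers advance by one entry per coefficient, so the caller
 * is expected to supply trans_size * trans_size position-dependent factors
 * for the 0-vs-1 and 1-vs-2 decision boundaries.
 */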
WORD32 ihevc_q_iq_ssd_var_rnd_fact
(
WORD16 *pi2_coeffs,
WORD16 *pi2_quant_coeff,
WORD16 *pi2_q_dst,
WORD16 *pi2_iq_dst,
WORD32 trans_size,
WORD32 qp_div,/* qpscaled / 6 */
WORD32 qp_rem,/* qpscaled % 6 */
WORD32 q_add,
WORD32 *pi4_quant_round_factor_0_1,
WORD32 *pi4_quant_round_factor_1_2,
WORD32 src_strd,
WORD32 dst_q_strd,
WORD32 dst_iq_strd,
UWORD8 *csbf,
WORD32 csbf_strd,
WORD32 *zero_col,
WORD32 *zero_row,
WORD16 *pi2_dequant_coeff,
LWORD64 *pi8_cost
)
{
WORD32 i, j;
WORD32 log2_size;
WORD16 *pi2_q_dst_orig;
WORD32 cbf = 0;
WORD32 bit_depth,shift_iq;
WORD32 val;
WORD16 i2_temp;
/* Initialize cost to zero */
WORD32 ssd_cost = 0;
(void)q_add;
pi2_q_dst_orig = pi2_q_dst;
/* Quant initialization */
GETRANGE(log2_size, trans_size);
log2_size -= 1;
bit_depth = 8 + 0;
shift_iq = bit_depth + log2_size - 5;
for(i = 0; i < trans_size; i++)
{
for(j = 0; j < trans_size; j++)
{
/* Back up the coefficients before Quantization */
i2_temp = pi2_coeffs[j];
{
/* Quantization */
QUANT(pi2_q_dst[j],i2_temp,
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, 0);
if (abs(pi2_q_dst[j]) >= 2)
{
QUANT(pi2_q_dst[j],i2_temp,
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
}
else if (abs(pi2_q_dst[j]) >= 1)
{
QUANT(pi2_q_dst[j],i2_temp,
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, *pi4_quant_round_factor_1_2);
}
else
{
/* Quantization */
QUANT(pi2_q_dst[j],i2_temp,
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, *pi4_quant_round_factor_0_1);
}
}
/* Inverse Quantization */
IQUANT(pi2_iq_dst[j],
pi2_q_dst[j], /*pi2_src[index*src_strd]*/
pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
/*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
shift_iq,
qp_div);
/* SSD Computation & Accumulation */
val = i2_temp - pi2_iq_dst[j];
ssd_cost += val*val;
pi4_quant_round_factor_0_1++;
pi4_quant_round_factor_1_2++;
}
pi2_q_dst += dst_q_strd;
pi2_iq_dst += dst_iq_strd;
pi2_quant_coeff += trans_size;
pi2_coeffs += src_strd;
pi2_dequant_coeff += trans_size;
}
/* Store the cost */
*pi8_cost = ssd_cost;
/* CSBF update */
{
WORD32 block_row, block_col;
WORD32 row, col;
WORD16 *pi2_block;
UWORD32 temp_zero_col = 0;
UWORD32 temp_zero_row = 0;
pi2_q_dst = pi2_q_dst_orig;
for(block_row = 0; block_row < trans_size; block_row += 4)
{
// block_col increments by 1 (rather than 4) so that csbf can be indexed directly
for(block_col = 0; block_col < trans_size / 4; block_col++)
{
pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
*(csbf + block_col) = 0;
for(row = 0; row < 4; row++)
{
for(col = 0; col < 4; col++)
{
if(pi2_block[row * dst_q_strd + col] != 0)
{
*(csbf + block_col) = 1;
break;
}
}
if(*(csbf + block_col) == 1)
{
/* zero_col update: temp_zero_col = ~zero_col */
temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
// zero_col can be optimized further: for now, all 4 bits
// corresponding to the 4 columns of the 4x4 block are cleared
// whenever the 4x4 csbf is set
/* zero_row update: temp_zero_row = ~zero_row */
temp_zero_row = (temp_zero_row) | (0xFU << block_row);
// zero_row can be optimized further: for now, all 4 bits
// corresponding to the 4 rows of the 4x4 block are cleared
// whenever the 4x4 csbf is set
break;
}
}
cbf = cbf || (*(csbf + block_col)); // cbf update
}
csbf += csbf_strd;
}
*zero_col = ~temp_zero_col; //final zero_col storing
*zero_row = ~temp_zero_row; //final zero_row storing
}
return cbf;
}
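/**
 * Variant of ihevc_q_iq_ssd_var_rnd_fact() that skips the transform-domain
 * SSD computation; pi8_cost is unused.
 */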
WORD32 ihevc_q_iq_var_rnd_fact
(
WORD16 *pi2_coeffs,
WORD16 *pi2_quant_coeff,
WORD16 *pi2_q_dst,
WORD16 *pi2_iq_dst,
WORD32 trans_size,
WORD32 qp_div,/* qpscaled / 6 */
WORD32 qp_rem,/* qpscaled % 6 */
WORD32 q_add,
WORD32 *pi4_quant_round_factor_0_1,
WORD32 *pi4_quant_round_factor_1_2,
WORD32 src_strd,
WORD32 dst_q_strd,
WORD32 dst_iq_strd,
UWORD8 *csbf,
WORD32 csbf_strd,
WORD32 *zero_col,
WORD32 *zero_row,
WORD16 *pi2_dequant_coeff,
LWORD64 *pi8_cost
)
{
WORD32 i, j;
WORD32 log2_size;
WORD16 *pi2_q_dst_orig;
WORD32 cbf = 0;
WORD32 bit_depth,shift_iq;
WORD16 i2_temp;
(void)q_add;
(void)pi8_cost;
pi2_q_dst_orig = pi2_q_dst;
GETRANGE(log2_size, trans_size);
log2_size -= 1;
bit_depth = 8 + 0;
shift_iq = bit_depth + log2_size - 5;
for(i = 0; i < trans_size; i++)
{
for(j = 0; j < trans_size; j++)
{
i2_temp = pi2_coeffs[j];
{
QUANT(pi2_q_dst[j],i2_temp,
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, 0);
if (abs(pi2_q_dst[j]) >= 2)
{
QUANT(pi2_q_dst[j],i2_temp,
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
}
else if (abs(pi2_q_dst[j]) >= 1)
{
QUANT(pi2_q_dst[j],i2_temp,
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, *pi4_quant_round_factor_1_2);
}
else
{
QUANT(pi2_q_dst[j],i2_temp,
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, *pi4_quant_round_factor_0_1);
}
}
IQUANT(pi2_iq_dst[j],
pi2_q_dst[j], /*pi2_src[index*src_strd]*/
pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
shift_iq,
qp_div);
pi4_quant_round_factor_0_1++;
pi4_quant_round_factor_1_2++;
}
pi2_q_dst += dst_q_strd;
pi2_iq_dst += dst_iq_strd;
pi2_quant_coeff += trans_size;
pi2_coeffs += src_strd;
pi2_dequant_coeff += trans_size;
}
/* CSBF update */
{
WORD32 block_row, block_col;
WORD32 row, col;
WORD16 *pi2_block;
UWORD32 temp_zero_col = 0;
UWORD32 temp_zero_row = 0;
pi2_q_dst = pi2_q_dst_orig;
for(block_row = 0; block_row < trans_size; block_row += 4)
{
// block_col increments by 1 (rather than 4) so that csbf can be indexed directly
for(block_col = 0; block_col < trans_size / 4; block_col++)
{
pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
*(csbf + block_col) = 0;
for(row = 0; row < 4; row++)
{
for(col = 0; col < 4; col++)
{
if(pi2_block[row * dst_q_strd + col] != 0)
{
*(csbf + block_col) = 1;
break;
}
}
if(*(csbf + block_col) == 1)
{
/* zero_col update: temp_zero_col = ~zero_col */
temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
// zero_col can be optimized further: for now, all 4 bits
// corresponding to the 4 columns of the 4x4 block are cleared
// whenever the 4x4 csbf is set
/* zero_row update: temp_zero_row = ~zero_row */
temp_zero_row = (temp_zero_row) | (0xFU << block_row);
// zero_row can be optimized further: for now, all 4 bits
// corresponding to the 4 rows of the 4x4 block are cleared
// whenever the 4x4 csbf is set
break;
}
}
cbf = cbf || (*(csbf + block_col)); // cbf update
}
csbf += csbf_strd;
}
*zero_col = ~temp_zero_col; //final zero_col storing
*zero_row = ~temp_zero_row; //final zero_row storing
}
return cbf;
}
/**
*******************************************************************************
*
* @brief
* This function performs quantization (using a flat scale matrix) with
* per-coefficient variable rounding factors, followed by inverse
* quantization to find the transform-domain SSD
*
* @par Description:
* Performs flat-scale quantization with per-coefficient rounding factors,
* followed by inverse quantization, and accumulates the transform-domain SSD
*
* @param[in] pi2_coeffs
* Input transform coefficients (trans_size x trans_size block)
*
* @param[in] pi2_quant_coeff
* Scaling matrix for quantization
*
* @param[out] pi2_q_dst
* Output quantized coefficients
*
* @param[out] pi2_iq_dst
* Output inverse-quantized coefficients
*
* @param[in] trans_size
* Transform block size
*
* @param[in] qp_div
* Quantization parameter / 6
*
* @param[in] qp_rem
* Quantization parameter % 6
*
* @param[in] q_add
* Quantization rounding factor (ignored by the _var_rnd_fact variants)
*
* @param[in] pi4_quant_round_factor_0_1
* Per-coefficient rounding factors for the 0-vs-1 level decision
* (used only by the _var_rnd_fact variants)
*
* @param[in] pi4_quant_round_factor_1_2
* Per-coefficient rounding factors for the 1-vs-2 level decision
* (used only by the _var_rnd_fact variants)
*
* @param[in] src_strd
* Input stride
*
* @param[in] dst_q_strd
* Output stride for quantized coefficients
*
* @param[in] dst_iq_strd
* Output stride for inverse-quantized coefficients
*
* @param[out] csbf
* Coded sub-block flags
*
* @param[in] csbf_strd
* Coded sub-block flag buffer stride
*
* @param[out] zero_col
* Zero column flags
*
* @param[out] zero_row
* Zero row flags
*
* @param[in] pi2_dequant_coeff
* Scaling matrix for inverse quantization
*
* @param[out] pi8_cost
* Transform-domain SSD (written only by the _ssd variants)
*
* @returns cbf
* coded block flag
*
* @remarks
* None
*
*******************************************************************************
*/
WORD32 ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact
(
WORD16 *pi2_coeffs,
WORD16 *pi2_quant_coeff,
WORD16 *pi2_q_dst,
WORD16 *pi2_iq_dst,
WORD32 trans_size,
WORD32 qp_div,/* qpscaled / 6 */
WORD32 qp_rem,/* qpscaled % 6 */
WORD32 q_add,
WORD32 *pi4_quant_round_factor_0_1,
WORD32 *pi4_quant_round_factor_1_2,
WORD32 src_strd,
WORD32 dst_q_strd,
WORD32 dst_iq_strd,
UWORD8 *csbf,
WORD32 csbf_strd,
WORD32 *zero_col,
WORD32 *zero_row,
WORD16 *pi2_dequant_coeff,
LWORD64 *pi8_cost
)
{
WORD32 i, j;
WORD32 log2_size;
WORD16 *pi2_q_dst_orig;
WORD32 cbf = 0;
WORD32 bit_depth,shift_iq;
WORD32 val;
WORD16 i2_temp;
/* Initialize cost to zero */
WORD32 ssd_cost = 0;
(void)q_add;
pi2_q_dst_orig = pi2_q_dst;
/* Quant initialization */
GETRANGE(log2_size, trans_size);
log2_size -= 1;
bit_depth = 8 + 0;
shift_iq = bit_depth + log2_size - 5;
for(i = 0; i < trans_size; i++)
{
for(j = 0; j < trans_size; j++)
{
WORD16 i2_temp1;
/* Back up the coefficients before Quantization */
i2_temp = pi2_coeffs[j];
/*QUANT(pi2_dst[j], pi2_coeffs[j],
pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, q_add);*/
/* modified by 1028 */
/* Quantization */
{
QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, 0);
i2_temp1 = pi2_q_dst[j];
if (abs(pi2_q_dst[j]) >= 2)
{
QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
}
else if (abs(pi2_q_dst[j]) >= 1)
{
QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, *pi4_quant_round_factor_1_2);
}
else
{
QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, *pi4_quant_round_factor_0_1);
}
}
ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
/* Inverse Quantization */
IQUANT(pi2_iq_dst[j],
pi2_q_dst[j], /*pi2_src[index*src_strd]*/
pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
shift_iq,
qp_div);
/* SSD Computation & Accumulation */
val = i2_temp - pi2_iq_dst[j];
ssd_cost += val*val;
pi4_quant_round_factor_0_1++;
pi4_quant_round_factor_1_2++;
}
pi2_q_dst += dst_q_strd;
pi2_iq_dst += dst_iq_strd;
pi2_quant_coeff += trans_size;
pi2_coeffs += src_strd;
pi2_dequant_coeff += trans_size;
}
/* Store the cost */
*pi8_cost = ssd_cost;
/* CSBF update */
{
WORD32 block_row, block_col;
WORD32 row, col;
WORD16 *pi2_block;
UWORD32 temp_zero_col = 0;
UWORD32 temp_zero_row = 0;
pi2_q_dst = pi2_q_dst_orig;
for(block_row = 0; block_row < trans_size; block_row += 4)
{
// block_col increments by 1 (rather than 4) so that csbf can be indexed directly
for(block_col = 0; block_col < trans_size / 4; block_col++)
{
pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
*(csbf + block_col) = 0;
for(row = 0; row < 4; row++)
{
for(col = 0; col < 4; col++)
{
if(pi2_block[row * dst_q_strd + col] != 0)
{
*(csbf + block_col) = 1;
break;
}
}
if(*(csbf + block_col) == 1)
{
/* zero_col update: temp_zero_col = ~zero_col */
temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
// zero_col can be optimized further: for now, all 4 bits
// corresponding to the 4 columns of the 4x4 block are cleared
// whenever the 4x4 csbf is set
/* zero_row update: temp_zero_row = ~zero_row */
temp_zero_row = (temp_zero_row) | (0xFU << block_row);
// zero_row can be optimized further: for now, all 4 bits
// corresponding to the 4 rows of the 4x4 block are cleared
// whenever the 4x4 csbf is set
break;
}
}
cbf = cbf || (*(csbf + block_col)); // cbf update
}
csbf += csbf_strd;
}
*zero_col = ~temp_zero_col; //final zero_col storing
*zero_row = ~temp_zero_row; //final zero_row storing
}
return cbf;
}
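/**
 * Variant of ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact() that skips the
 * transform-domain SSD computation; pi8_cost is unused.
 */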
WORD32 ihevc_q_iq_flat_scale_mat_var_rnd_fact
(
WORD16 *pi2_coeffs,
WORD16 *pi2_quant_coeff,
WORD16 *pi2_q_dst,
WORD16 *pi2_iq_dst,
WORD32 trans_size,
WORD32 qp_div,/* qpscaled / 6 */
WORD32 qp_rem,/* qpscaled % 6 */
WORD32 q_add,
WORD32 *pi4_quant_round_factor_0_1,
WORD32 *pi4_quant_round_factor_1_2,
WORD32 src_strd,
WORD32 dst_q_strd,
WORD32 dst_iq_strd,
UWORD8 *csbf,
WORD32 csbf_strd,
WORD32 *zero_col,
WORD32 *zero_row,
WORD16 *pi2_dequant_coeff,
LWORD64 *pi8_cost
)
{
WORD32 i, j;
WORD32 log2_size;
WORD16 *pi2_q_dst_orig;
WORD32 cbf = 0;
WORD32 bit_depth,shift_iq;
WORD16 i2_temp;
(void)q_add;
(void)pi8_cost;
pi2_q_dst_orig = pi2_q_dst;
GETRANGE(log2_size, trans_size);
log2_size -= 1;
bit_depth = 8 + 0;
shift_iq = bit_depth + log2_size - 5;
for(i = 0; i < trans_size; i++)
{
for(j = 0; j < trans_size; j++)
{
WORD16 i2_temp1;
i2_temp = pi2_coeffs[j];
{
QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, 0);
i2_temp1 = pi2_q_dst[j];
if (abs(pi2_q_dst[j]) >= 2)
{
QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
}
else if (abs(pi2_q_dst[j]) >= 1)
{
QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, *pi4_quant_round_factor_1_2);
}
else
{
QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
g_ihevc_quant_scales[qp_rem], qp_div,
log2_size, *pi4_quant_round_factor_0_1);
}
}
ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
IQUANT(pi2_iq_dst[j],
pi2_q_dst[j], /*pi2_src[index*src_strd]*/
pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
shift_iq,
qp_div);
pi4_quant_round_factor_0_1++;
pi4_quant_round_factor_1_2++;
}
pi2_q_dst += dst_q_strd;
pi2_iq_dst += dst_iq_strd;
pi2_quant_coeff += trans_size;
pi2_coeffs += src_strd;
pi2_dequant_coeff += trans_size;
}
/* CSBF update */
{
WORD32 block_row, block_col;
WORD32 row, col;
WORD16 *pi2_block;
UWORD32 temp_zero_col = 0;
UWORD32 temp_zero_row = 0;
pi2_q_dst = pi2_q_dst_orig;
for(block_row = 0; block_row < trans_size; block_row += 4)
{
// block_col increments by 1 (rather than 4) so that csbf can be indexed directly
for(block_col = 0; block_col < trans_size / 4; block_col++)
{
pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
*(csbf + block_col) = 0;
for(row = 0; row < 4; row++)
{
for(col = 0; col < 4; col++)
{
if(pi2_block[row * dst_q_strd + col] != 0)
{
*(csbf + block_col) = 1;
break;
}
}
if(*(csbf + block_col) == 1)
{
/* zero_col update: temp_zero_col = ~zero_col */
temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
// zero_col can be optimized further: for now, all 4 bits
// corresponding to the 4 columns of the 4x4 block are cleared
// whenever the 4x4 csbf is set
/* zero_row update: temp_zero_row = ~zero_row */
temp_zero_row = (temp_zero_row) | (0xFU << block_row);
// zero_row can be optimized further: for now, all 4 bits
// corresponding to the 4 rows of the 4x4 block are cleared
// whenever the 4x4 csbf is set
break;
}
}
cbf = cbf || (*(csbf + block_col)); // cbf update
}
csbf += csbf_strd;
}
*zero_col = ~temp_zero_col; //final zero_col storing
*zero_row = ~temp_zero_row; //final zero_row storing
}
return cbf;
}