/* encoder/ih264e_core_coding.h - platform/external/libavc - Git at Google */

 /******************************************************************************
  *
  * Copyright (C) 2015 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at:
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
  *****************************************************************************
  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

 /**
 ******************************************************************************
 * @file
 *  ih264e_core_coding.h
 *
 * @brief
 *  This file contains extern declarations of core coding routines
 *
 * @author
 *  ittiam
 *
 * @remarks
 *  none
 ******************************************************************************
 */

 #ifndef IH264E_CORE_CODING_H_
 #define IH264E_CORE_CODING_H_

 /*****************************************************************************/
 /* Constant Macros                                                           */
 /*****************************************************************************/

 /**
 ******************************************************************************
  *  @brief      Values used to enable/disable the Hadamard transform of the
  *              DC coefficients
  *
  *  NOTE(review): presumably these are the values passed as the u4_dc_flag /
  *  u4_dc_trans_flag arguments of the luma 16x16 routines declared below
  *  (1 -> DC transform is done, 0 -> it is not) - confirm against the callers.
 ******************************************************************************
  */
 #define DISABLE_DC_TRANSFORM 0
 #define ENABLE_DC_TRANSFORM 1

 /**
 *******************************************************************************
  *  @brief Block/coefficient counts, 4x4 block dimensions and bit masks for
  *         the DC and AC control flags of the 32-bit transform control word
 *******************************************************************************
  */

 /* Coefficient and 4x4 block counts, as indicated by the macro names */
 #define DC_COEFF_CNT_LUMA_MB        16
 #define NUM_4X4_BLKS_LUMA_MB_ROW    4
 #define NUM_LUMA4x4_BLOCKS_IN_MB    16
 #define NUM_CHROMA4x4_BLOCKS_IN_MB  8

 /* Dimensions of a 4x4 block; TRANS_SIZE_4 is defined outside this header */
 #define SIZE_4X4_BLK_HRZ            TRANS_SIZE_4
 #define SIZE_4X4_BLK_VERT           TRANS_SIZE_4

 /* Luma: the DC mask selects the low 16 bits, the AC mask the high 16 bits */
 #define CNTRL_FLAG_DC_MASK_LUMA     0x0000FFFF
 #define CNTRL_FLAG_AC_MASK_LUMA     0xFFFF0000

 /* Chroma U: AC flags in bits 31-28, DC mask covers bits 15-12 */
 #define CNTRL_FLAG_AC_MASK_CHROMA_U 0xF0000000
 #define CNTRL_FLAG_DC_MASK_CHROMA_U 0x0000F000

 /* Chroma V: AC flags in bits 27-24, DC mask covers bits 11-8 */
 #define CNTRL_FLAG_AC_MASK_CHROMA_V 0x0F000000
 #define CNTRL_FLAG_DC_MASK_CHROMA_V 0x00000F00

 /* Combined U + V masks */
 #define CNTRL_FLAG_AC_MASK_CHROMA   ( CNTRL_FLAG_AC_MASK_CHROMA_U | CNTRL_FLAG_AC_MASK_CHROMA_V )
 #define CNTRL_FLAG_DC_MASK_CHROMA   ( CNTRL_FLAG_DC_MASK_CHROMA_U | CNTRL_FLAG_DC_MASK_CHROMA_V )

 /* Bits 15 and 14: the chroma inverse transform documentation in this header
  * assigns these to the DC blocks of the U and V planes respectively */
 #define CNTRL_FLAG_DCBLK_MASK_CHROMA 0x0000C000

 /**
 *******************************************************************************
  *  @brief macros for transforms
 *******************************************************************************
  */
 /**
  * Pops the most significant set bit from a control word.
  *
  * blk_lin_id receives CLZ(u4_cntrl); u4_cntrl is then masked with
  * (0x7FFFFFFF >> blk_lin_id), which clears the bit located blk_lin_id
  * positions below the MSB together with every bit above it.
  *
  * Both arguments must be modifiable lvalues and are evaluated more than once.
  * CLZ is defined outside this header (presumably count-leading-zeros).
  *
  * NOTE(review): for u4_cntrl == 0 the shift count is whatever CLZ returns for
  * zero; callers are presumably expected to check u4_cntrl first - confirm.
  *
  * The expansion is a plain brace block without a trailing ';' so that
  * existing call sites keep compiling whether or not they add their own ';'.
  */
 #define DEQUEUE_BLKID_FROM_CONTROL( u4_cntrl,  blk_lin_id)                     \
 {                                                                              \
   (blk_lin_id) = CLZ(u4_cntrl);                                                \
   (u4_cntrl) &= (0x7FFFFFFF >> (blk_lin_id));                                  \
 }

 /**
  * Converts a linear 4x4 block index into x/y offsets inside a luma MB.
  *
  * The index is interpreted in raster order with 4 blocks per row:
  *   i4_offset_x = (u4_blk_id % 4) * 4
  *   i4_offset_y = (u4_blk_id / 4) * 4
  *
  * i4_offset_x and i4_offset_y must be modifiable lvalues. u4_blk_id is
  * evaluated twice, so it must not have side effects; it is parenthesized so
  * that expression arguments such as (a + b) are handled correctly.
  */
 #define IND2SUB_LUMA_MB(u4_blk_id,i4_offset_x,i4_offset_y)                      \
 {                                                                               \
      (i4_offset_x) = ((u4_blk_id) % 4) << 2;                                    \
      (i4_offset_y) = ((u4_blk_id) / 4) << 2;                                    \
 }

 /**
  * Converts a linear chroma 4x4 block index into x/y offsets.
  *
  *   i4_offset_x = 8 if bit 0 of u4_blk_id is set, else 0; plus 1 when
  *                 u4_blk_id > 3
  *   i4_offset_y = 4 if bit 1 of u4_blk_id is set, else 0
  *
  * NOTE(review): the 8-sample horizontal step and the +1 for indices 4..7 are
  * consistent with the interleaved U/V layout described for the chroma
  * routines in this header (indices 0..3 -> U, 4..7 -> V) - confirm.
  *
  * i4_offset_x and i4_offset_y must be modifiable lvalues. u4_blk_id is
  * evaluated several times, so it must not have side effects; it is
  * parenthesized so that expression arguments are handled correctly.
  */
 #define IND2SUB_CHROMA_MB(u4_blk_id,i4_offset_x,i4_offset_y)                   \
 {                                                                              \
      (i4_offset_x) = (((u4_blk_id) & 0x1) << 3) + ((u4_blk_id) > 3);           \
      (i4_offset_y) = ((u4_blk_id) & 0x2) << 1;                                 \
 }


 /*****************************************************************************/
 /* Function Declarations                                                     */
 /*****************************************************************************/

 /**
 *******************************************************************************
 *
 * @brief
 *  This function does the DCT transform, then the Hadamard transform and
 *  the quantization for a macroblock when the mb mode is intra 16x16 mode
 *
 * @par Description:
 *  First cf4 is done on all 16 4x4 blocks of the 16x16 input block.
 *  Then the Hadamard transform is done on the DC coefficients.
 *  Quantization is then performed on the 16x16 block, 4x4 wise.
 *
 * @param[in] ps_codec
 *  Pointer to the codec context
 *  NOTE(review): not described in the original header - confirm its role
 *
 * @param[in] pu1_src
 *  Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *  Pointer to prediction sub-block
 *
 * @param[out] pi2_out
 *  Pointer to residual sub-block
 *  The output will be in linear format
 *  The first 16 contiguous locations will contain the values of the DC block
 *  After the DC block and a stride, the 1st AC block will follow
 *  After one more stride the next AC block will follow
 *  The blocks will be in raster scan order
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Destination stride
 *
 * @param[in] pu2_scale_matrix
 *  The quantization matrix for 4x4 transform
 *
 * @param[in] pu2_threshold_matrix
 *  Threshold matrix
 *
 * @param[in] u4_qbits
 *  15+QP/6
 *
 * @param[in] u4_round_factor
 *  Round factor for quant
 *
 * @param[out] pu1_nnz
 *  Memory to store the number of non-zero coefficients after the transform
 *  The first byte will be the nnz of the DC block
 *  From the next byte the AC nnzs will be stored in raster scan order
 *
 * @param[in] u4_dc_flag
 *  Signals if the DC transform is to be done or not
 *   1 -> DC transform will be done
 *   0 -> DC transform will not be done
 *
 * @returns
 *  none
 *
 * @remarks
 *
 *******************************************************************************
 */
 void ih264e_luma_16x16_resi_trans_dctrans_quant(
                 codec_t *ps_codec, UWORD8 *pu1_src, UWORD8 *pu1_pred,
                 WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd,
                 WORD32 dst_strd, const UWORD16 *pu2_scale_matrix,
                 const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
                 UWORD32 u4_round_factor, UWORD8 *pu1_nnz, UWORD32 u4_dc_flag);

 /**
 *******************************************************************************
 *
 * @brief
 *  This function performs the intra 16x16 inverse transform process for H264.
 *  It includes the inverse DC transform, inverse quant and then the inverse
 *  transform
 *
 * @par Description:
 *
 * @param[in] ps_codec
 *  Pointer to the codec context
 *  NOTE(review): not described in the original header - confirm its role
 *
 * @param[in] pi2_src
 *  Input data, 16x16 size
 *  The first 16 memory locations hold the DC coeffs in raster scan order in
 *  linear fashion
 *  After a stride the 1st AC block will be present, again in raster scan order
 *  Then each AC block of the 16x16 block will follow in raster scan order
 *
 * @param[in] pu1_pred
 *  The predicted data, 16x16 size
 *  Block by block form
 *
 * @param[out] pu1_out
 *  Output 16x16
 *  In block by block form
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] pred_strd
 *  Input stride for prediction buffer
 *
 * @param[in] out_strd
 *  Input stride for output buffer
 *
 * @param[in] pu2_iscale_mat
 *  Inverse quantization matrix for 4x4 transform
 *
 * @param[in] pu2_weigh_mat
 *  Weight matrix of 4x4 transform
 *
 * @param[in] qp_div
 *  QP/6
 *
 * @param[in] u4_cntrl
 *  Controls the transform path
 *  A total of 17 bits are used
 *  The 16th bit corresponds to the DC block
 *  and bits 32-17 correspond to the AC blocks in raster scan order
 *  A bit equal to zero indicates that the entire 4x4 block is zero for DC
 *  For AC blocks a bit equal to zero means that all 15 AC coeffs of the
 *  block are zero
 *  NOTE(review): the original text said "nonzero" for the AC case, which
 *  contradicts the DC case and the chroma routine's description - confirm
 *
 * @param[in] u4_dc_trans_flag
 *  NOTE(review): not described in the original header; presumably selects
 *  whether the inverse DC transform is performed (see ENABLE_DC_TRANSFORM /
 *  DISABLE_DC_TRANSFORM) - confirm
 *
 * @param[in] pi4_tmp
 *  Input temporary buffer
 *  Needs to be at least COFF_CNT_SUB_BLK_4x4+COFF_CNT_SUB_BLK_4x4 in size
 *  NOTE(review): the original header also stated "at least 20 in size";
 *  the two figures should be reconciled
 *
 * @returns
 *  none
 *
 * @remarks
 *  The all zero case must be taken care of outside
 *
 *******************************************************************************
 */
 void ih264e_luma_16x16_idctrans_iquant_itrans_recon(
                 codec_t *ps_codec, WORD16 *pi2_src, UWORD8 *pu1_pred,
                 UWORD8 *pu1_out, WORD32 src_strd, WORD32 pred_strd,
                 WORD32 out_strd, const UWORD16 *pu2_iscale_mat,
                 const UWORD16 *pu2_weigh_mat, UWORD32 qp_div, UWORD32 u4_cntrl,
                 UWORD32 u4_dc_trans_flag, WORD32 *pi4_tmp);

 /**
 *******************************************************************************
 *
 * @brief
 *  This function does the DCT transform, then the Hadamard transform and
 *  the quantization for a chroma macroblock
 *
 * @par Description:
 *  First cf4 is done on all 16 4x4 blocks of the 8x8 input block
 *  (NOTE(review): NUM_CHROMA4x4_BLOCKS_IN_MB is 8 - confirm the block count)
 *  Then the Hadamard transform is done on the DC coefficients
 *  Quantization is then performed on the 8x8 block, 4x4 wise
 *
 * @param[in] ps_codec
 *  Pointer to the codec context
 *  NOTE(review): not described in the original header - confirm its role
 *
 * @param[in] pu1_src
 *  Pointer to source sub-block
 *  The input is in interleaved format for the two chroma planes
 *
 * @param[in] pu1_pred
 *  Pointer to prediction sub-block
 *  Prediction is in interleaved format
 *
 * @param[out] pi2_out
 *  Pointer to residual sub-block
 *  The output will be in linear format
 *  The first 4 contiguous locations will contain the values of the DC block
 *  for U and the next 4 will contain those for V.
 *  After the DC block and a stride, the 1st AC block of the U plane will follow
 *  After one more stride the next AC block will follow
 *  (NOTE(review): the original text said "of V plane" here, which conflicts
 *  with the sentence below - confirm)
 *  The blocks will be in raster scan order
 *
 *  After all the AC blocks of the U plane, the AC blocks of the V plane will
 *  follow in exactly the same way
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] out_strd
 *  Destination stride
 *
 * @param[in] pu2_scale_matrix
 *  The quantization matrix for 4x4 transform
 *
 * @param[in] pu2_threshold_matrix
 *  Threshold matrix
 *
 * @param[in] u4_qbits
 *  15+QP/6
 *
 * @param[in] u4_round_factor
 *  Round factor for quant
 *
 * @param[out] pu1_nnz_c
 *  Memory to store the number of non-zero coefficients after the transform
 *  The first byte will be the nnz of the DC block for the U plane
 *  From the next byte the AC nnzs will be stored in raster scan order
 *  The fifth byte will be the nnz of the DC block of the V plane
 *  Then the AC blocks will follow
 *
 * @returns
 *  none
 *
 * @remarks
 *  The original header also documented a u4_dc_flag parameter; the prototype
 *  has no such parameter.
 *
 *******************************************************************************
 */
 void ih264e_chroma_8x8_resi_trans_dctrans_quant(
                 codec_t *ps_codec, UWORD8 *pu1_src, UWORD8 *pu1_pred,
                 WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd,
                 WORD32 out_strd, const UWORD16 *pu2_scale_matrix,
                 const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
                 UWORD32 u4_round_factor, UWORD8 *pu1_nnz_c);

 /**
 *******************************************************************************
 * @brief
 *  This function performs the inverse transform process for a chroma MB of
 *  H264
 *
 * @par Description:
 *  Does the inverse DC transform, inverse quantization and inverse transform
 *
 * @param[in] ps_codec
 *  Pointer to the codec context
 *  NOTE(review): not described in the original header - confirm its role
 *
 * @param[in] pi2_src
 *  Input data, 16x16 size
 *  The first 4 locations contain the DC coeffs of the U plane, the next 4
 *  contain the DC coeffs of the V plane, then the AC blocks of the U plane
 *  follow in raster scan order, each block as a linear array in raster scan
 *  order.
 *  After a stride the next AC block will follow. After all AC blocks of the
 *  U plane, the V plane AC blocks will follow in exactly the same order.
 *
 * @param[in] pu1_pred
 *  The predicted data, 8x16 size, U and V interleaved
 *
 * @param[out] pu1_out
 *  Output 8x16, U and V interleaved
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] pred_strd
 *  Input stride for prediction buffer
 *
 * @param[in] out_strd
 *  Input stride for output buffer
 *
 * @param[in] pu2_iscale_mat
 *  Inverse quantization matrix for 4x4 transform
 *
 * @param[in] pu2_weigh_mat
 *  Weight matrix of 4x4 transform
 *
 * @param[in] qp_div
 *  QP/6
 *
 * @param[in] u4_cntrl
 *  Controls the transform path
 *  The 15th bit corresponds to the DC block of the U plane, the 14th to the
 *  DC block of the V plane (see CNTRL_FLAG_DCBLK_MASK_CHROMA)
 *  Bits 32-28 indicate the AC blocks of the U plane in raster scan order
 *  Bits 27-23 indicate the AC blocks of the V plane in raster scan order
 *  (NOTE(review): CNTRL_FLAG_AC_MASK_CHROMA_U / _V are 0xF0000000 / 0x0F000000,
 *  i.e. four bits per plane - confirm the exact bit numbering)
 *  A bit equal to 1 implies that there is at least one non-zero coeff in
 *  the block
 *
 * @param[in] pi4_tmp
 *  Input temporary buffer
 *  Needs to be at least COFF_CNT_SUB_BLK_4x4 + number of DC coeffs for
 *  chroma * number of planes in size
 *
 * @returns
 *  none
 *
 * @remarks
 *******************************************************************************
 */
 void ih264e_chroma_8x8_idctrans_iquant_itrans_recon(
                 codec_t *ps_codec, WORD16 *pi2_src, UWORD8 *pu1_pred,
                 UWORD8 *pu1_out, WORD32 src_strd, WORD32 pred_strd,
                 WORD32 out_strd, const UWORD16 *pu2_iscale_mat,
                 const UWORD16 *pu2_weigh_mat, UWORD32 qp_div, UWORD32 u4_cntrl,
                 WORD32 *pi4_tmp);

 /**
 ******************************************************************************
 *
 * @brief  This function packs the residue of an i16x16 luma mb for entropy
 *         coding
 *
 * @par   Description
 *  An i16 macro block contains two classes of units, the dc 4x4 block and
 *  the 4x4 ac blocks. While packing the mb, the dc block is sent first, and
 *  the 16 ac blocks are sent next in scan order. Each and every block is
 *  represented by 3 parameters (nnz, significant coefficient map and the
 *  residue coefficients themselves). If a 4x4 unit does not have any
 *  coefficients then only nnz is sent. Inside a 4x4 block the individual
 *  coefficients are sent in scan order.
 *
 *  The first byte of each block will be the nnz of the block; if it is non
 *  zero, a 2 byte significance map is sent. This is followed by the nonzero
 *  coefficients. This is repeated for 1 dc + 16 ac blocks.
 *
 * @param[in]  pi2_res_mb
 *  pointer to residue mb
 *
 * @param[in, out]  pv_mb_coeff_data
 *  buffer pointing to packed residue coefficients
 *
 * @param[in]  i4_res_strd
 *  residual block stride
 *
 * @param[out]  u1_cbp_l
 *  coded block pattern luma
 *
 * @param[in]   pu1_nnz
 *  number of non zero coefficients in each 4x4 unit
 *
 * @param[out]  pu4_cntrl
 *  Control signal for the inverse transform of 16x16 blocks
 *
 * @return none
 *
 * @remarks
 *
 ******************************************************************************
 */
 void ih264e_pack_l_mb_i16(WORD16 *pi2_res_mb, void **pv_mb_coeff_data,
                           WORD32 i4_res_strd, UWORD8 *u1_cbp_l, UWORD8 *pu1_nnz,
                           UWORD32 *pu4_cntrl);

 /**
 ******************************************************************************
 *
 * @brief  This function packs the residue of an i8x8 chroma mb for entropy
 *         coding
 *
 * @par   Description
 *  An i8 chroma macro block contains two classes of units, the dc 2x2 block
 *  and the 4x4 ac blocks. While packing the mb, the dc block is sent first,
 *  and the 4 ac blocks are sent next in scan order. Each and every block is
 *  represented by 3 parameters (nnz, significant coefficient map and the
 *  residue coefficients themselves). If a 4x4 unit does not have any
 *  coefficients then only nnz is sent. Inside a 4x4 block the individual
 *  coefficients are sent in scan order.
 *
 *  The first byte of each block will be the nnz of the block; if it is non
 *  zero, a 2 byte significance map is sent. This is followed by the nonzero
 *  coefficients. This is repeated for 1 dc + 4 ac blocks.
 *
 * @param[in]  pi2_res_mb
 *  pointer to residue mb
 *
 * @param[in, out]  pv_mb_coeff_data
 *  buffer pointing to packed residue coefficients
 *
 * @param[in]  i4_res_strd
 *  residual block stride
 *
 * @param[out]  u1_cbp_c
 *  coded block pattern chroma
 *
 * @param[in]   pu1_nnz
 *  number of non zero coefficients in each 4x4 unit
 *
 * @param[in]   u4_kill_coffs_flag
 *  NOTE(review): not described in the original header; the name suggests it
 *  requests that coefficients be discarded - confirm against the definition
 *
 * @param[out]  pu4_cntrl
 *  Control signal for the inverse transform
 *
 * @param[in]   u4_swap_uv
 *  Swaps the order of U and V planes in the entropy bitstream
 *
 * @return none
 *
 * @remarks
 *
 ******************************************************************************
 */
 void ih264e_pack_c_mb(WORD16 *pi2_res_mb, void **pv_mb_coeff_data,
                       WORD32 i4_res_strd, UWORD8 *u1_cbp_c, UWORD8 *pu1_nnz,
                       UWORD32 u4_kill_coffs_flag, UWORD32 *pu4_cntrl,
                       UWORD32 u4_swap_uv);

 /**
 *******************************************************************************
 *
 * @brief performs luma core coding when the intra mode is i16x16
 *
 * @par Description:
 *  If the current mb is to be coded as intra of mb type i16x16, the mb is first
 *  predicted using one of the i16x16 prediction filters, based on the intra
 *  mode chosen. Then, the error is computed between the input blk and the
 *  estimated blk. This error is transformed (hierarchical transform, i.e. dct
 *  followed by Hadamard) and quantized. The quantized coefficients are packed
 *  in scan order for entropy coding.
 *
 * @param[in] ps_proc
 *  pointer to the current macro block context
 *
 * @returns u1_cbp_l
 *  coded block pattern luma
 *
 * @remarks none
 *
 *******************************************************************************
 */
 UWORD8 ih264e_code_luma_intra_macroblock_16x16
         (
             process_ctxt_t *ps_proc
         );

 /**
 *******************************************************************************
 *
 * @brief performs luma core coding when the intra mode is i4x4
 *
 * @par Description:
 *  If the current mb is to be coded as intra of mb type i4x4, the mb is first
 *  predicted using one of the i4x4 prediction filters, based on the intra mode
 *  chosen. Then, the error is computed between the input blk and the estimated
 *  blk. This error is dct transformed and quantized. The quantized
 *  coefficients are packed in scan order for entropy coding.
 *
 * @param[in] ps_proc
 *  pointer to the current macro block context
 *
 * @returns u1_cbp_l
 *  coded block pattern luma
 *
 * @remarks
 *  The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan
 *  order mentioned in the h.264 specification
 *
 *******************************************************************************
 */
 UWORD8 ih264e_code_luma_intra_macroblock_4x4
         (
             process_ctxt_t *ps_proc
         );

 /**
 *******************************************************************************
 *
 * @brief performs luma core coding when the intra mode is i4x4
 *
 * @par Description:
 *  If the current mb is to be coded as intra of mb type i4x4, the mb is first
 *  predicted using one of the i4x4 prediction filters, based on the intra mode
 *  chosen. Then, the error is computed between the input blk and the estimated
 *  blk. This error is dct transformed and quantized. The quantized
 *  coefficients are packed in scan order for entropy coding.
 *
 *  NOTE(review): this description is word-for-word the one given for
 *  ih264e_code_luma_intra_macroblock_4x4; the _rdopt_on suffix suggests a
 *  variant used when RD optimization is enabled - confirm and document the
 *  difference.
 *
 * @param[in] ps_proc
 *  pointer to the current macro block context
 *
 * @returns u1_cbp_l
 *  coded block pattern luma
 *
 * @remarks
 *  The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan
 *  order mentioned in the h.264 specification
 *
 *******************************************************************************
 */
 UWORD8 ih264e_code_luma_intra_macroblock_4x4_rdopt_on
         (
             process_ctxt_t *ps_proc
         );

 /**
 *******************************************************************************
 *
 * @brief performs chroma core coding for intra macro blocks
 *
 * @par Description:
 *  If the current MB is to be intra coded with mb type chroma I8x8, the MB is
 *  first predicted using the intra 8x8 prediction filters. The predicted data
 *  is compared with the input to obtain the error, and the error is
 *  transformed. The DC coefficients of each transformed sub block are further
 *  transformed using the Hadamard transform. The resulting coefficients are
 *  quantized, packed and sent for entropy coding.
 *
 * @param[in] ps_proc
 *  pointer to the current macro block context
 *
 * @returns u1_cbp_c
 *  coded block pattern chroma
 *
 * @remarks
 *  The traversal of 4x4 subblocks in the 8x8 macroblock is as per the scan
 *  order mentioned in the h.264 specification
 *
 *******************************************************************************
 */
 UWORD8 ih264e_code_chroma_intra_macroblock_8x8
         (
             process_ctxt_t *ps_proc
         );

 /**
 *******************************************************************************
 * @brief performs luma core coding when the mode is inter
 *
 * @par Description:
 *  If the current mb is to be coded as an inter predicted mb, prediction is
 *  done based on the sub mb partitions and the corresponding motion vectors
 *  generated by ME. Then, the error is computed between the input blk and the
 *  estimated blk. This error is transformed (dct, without Hadamard) and
 *  quantized. The quantized coefficients are packed in scan order for entropy
 *  coding.
 *
 * @param[in] ps_proc
 *  pointer to the current macro block context
 *
 * @returns u1_cbp_l
 *  coded block pattern luma
 *
 * @remarks none
 *
 *******************************************************************************
 */
 UWORD8 ih264e_code_luma_inter_macroblock_16x16
         (
             process_ctxt_t *ps_proc
         );

 /**
 *******************************************************************************
 * @brief performs chroma core coding for inter macro blocks
 *
 * @par Description:
 *  If the current mb is to be coded as an inter predicted mb, prediction is
 *  done based on the sub mb partitions and the corresponding motion vectors
 *  generated by ME. Then, the error is computed between the input blk and the
 *  estimated blk. This error is transformed and quantized. The quantized
 *  coefficients are packed in scan order for entropy coding.
 *
 * @param[in] ps_proc
 *  pointer to the current macro block context
 *
 * @returns
 *  coded block pattern
 *  NOTE(review): the original header named this u1_cbp_l, "coded block
 *  pattern luma"; for a chroma routine this is presumably the chroma coded
 *  block pattern - confirm against the definition
 *
 * @remarks none
 *
 *******************************************************************************
 */
 UWORD8 ih264e_code_chroma_inter_macroblock_8x8
         (
             process_ctxt_t *ps_proc
         );

 #endif /* IH264E_CORE_CODING_H_ */
	/******************************************************************************
	*
	* Copyright (C) 2015 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at:
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	*****************************************************************************
	* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
	*/

	/**
	******************************************************************************
	* @file
	* ih264e_core_coding.h
	*
	* @brief
	* This file contains extern declarations of core coding routines
	*
	* @author
	* ittiam
	*
	* @remarks
	* none
	******************************************************************************
	*/

	#ifndef IH264E_CORE_CODING_H_
	#define IH264E_CORE_CODING_H_

	/*****************************************************************************/
	/* Constant Macros */
	/*****************************************************************************/

	/**
	******************************************************************************
	* @brief Values used to enable/disable the Hadamard transform of the
	* DC coefficients
	*
	* NOTE(review): presumably these are the values passed as the u4_dc_flag /
	* u4_dc_trans_flag arguments of the luma 16x16 routines (1 -> DC transform
	* is done, 0 -> it is not) - confirm against the callers.
	******************************************************************************
	*/
	#define DISABLE_DC_TRANSFORM 0
	#define ENABLE_DC_TRANSFORM 1

	/**
	*******************************************************************************
	* @brief Block/coefficient counts, 4x4 block dimensions and bit masks for
	* the DC and AC control flags of the 32-bit transform control word
	*******************************************************************************
	*/

	/* Coefficient and 4x4 block counts, as indicated by the macro names */
	#define DC_COEFF_CNT_LUMA_MB 16
	#define NUM_4X4_BLKS_LUMA_MB_ROW 4
	#define NUM_LUMA4x4_BLOCKS_IN_MB 16
	#define NUM_CHROMA4x4_BLOCKS_IN_MB 8

	/* Dimensions of a 4x4 block; TRANS_SIZE_4 is defined outside this header */
	#define SIZE_4X4_BLK_HRZ TRANS_SIZE_4
	#define SIZE_4X4_BLK_VERT TRANS_SIZE_4

	/* Luma: the DC mask selects the low 16 bits, the AC mask the high 16 bits */
	#define CNTRL_FLAG_DC_MASK_LUMA 0x0000FFFF
	#define CNTRL_FLAG_AC_MASK_LUMA 0xFFFF0000

	/* Chroma U: AC flags in bits 31-28, DC mask covers bits 15-12 */
	#define CNTRL_FLAG_AC_MASK_CHROMA_U 0xF0000000
	#define CNTRL_FLAG_DC_MASK_CHROMA_U 0x0000F000

	/* Chroma V: AC flags in bits 27-24, DC mask covers bits 11-8 */
	#define CNTRL_FLAG_AC_MASK_CHROMA_V 0x0F000000
	#define CNTRL_FLAG_DC_MASK_CHROMA_V 0x00000F00

	/* Combined U + V masks (plain bitwise OR; no escaping of the operator) */
	#define CNTRL_FLAG_AC_MASK_CHROMA ( CNTRL_FLAG_AC_MASK_CHROMA_U | CNTRL_FLAG_AC_MASK_CHROMA_V )
	#define CNTRL_FLAG_DC_MASK_CHROMA ( CNTRL_FLAG_DC_MASK_CHROMA_U | CNTRL_FLAG_DC_MASK_CHROMA_V )

	/* Bits 15 and 14: the chroma inverse transform documentation assigns these
	 * to the DC blocks of the U and V planes respectively */
	#define CNTRL_FLAG_DCBLK_MASK_CHROMA 0x0000C000

	/**
	*******************************************************************************
	* @brief macros for transforms
	*******************************************************************************
	*/
	/**
	 * Pops the most significant set bit from a control word.
	 *
	 * blk_lin_id receives CLZ(u4_cntrl); u4_cntrl is then masked with
	 * (0x7FFFFFFF >> blk_lin_id), which clears the bit located blk_lin_id
	 * positions below the MSB together with every bit above it.
	 *
	 * Both arguments must be modifiable lvalues and are evaluated more than
	 * once. CLZ is defined outside this header (presumably count-leading-zeros).
	 *
	 * NOTE(review): for u4_cntrl == 0 the shift count is whatever CLZ returns
	 * for zero; callers are presumably expected to check u4_cntrl first.
	 *
	 * The expansion is a plain brace block without a trailing ';' so that
	 * existing call sites keep compiling whether or not they add their own ';'.
	 */
	#define DEQUEUE_BLKID_FROM_CONTROL( u4_cntrl, blk_lin_id) \
	{ \
	(blk_lin_id) = CLZ(u4_cntrl); \
	(u4_cntrl) &= (0x7FFFFFFF >> (blk_lin_id)); \
	}

	/**
	 * Converts a linear 4x4 block index into x/y offsets inside a luma MB.
	 *
	 * The index is interpreted in raster order with 4 blocks per row:
	 *   i4_offset_x = (u4_blk_id % 4) * 4
	 *   i4_offset_y = (u4_blk_id / 4) * 4
	 *
	 * i4_offset_x and i4_offset_y must be modifiable lvalues. u4_blk_id is
	 * evaluated twice, so it must not have side effects; it is parenthesized
	 * so that expression arguments such as (a + b) are handled correctly.
	 */
	#define IND2SUB_LUMA_MB(u4_blk_id,i4_offset_x,i4_offset_y) \
	{ \
	(i4_offset_x) = ((u4_blk_id) % 4) << 2; \
	(i4_offset_y) = ((u4_blk_id) / 4) << 2; \
	}

	/**
	 * Converts a linear chroma 4x4 block index into x/y offsets.
	 *
	 *   i4_offset_x = 8 if bit 0 of u4_blk_id is set, else 0; plus 1 when
	 *                 u4_blk_id > 3
	 *   i4_offset_y = 4 if bit 1 of u4_blk_id is set, else 0
	 *
	 * NOTE(review): the 8-sample horizontal step and the +1 for indices 4..7
	 * are consistent with the interleaved U/V layout described for the chroma
	 * routines (indices 0..3 -> U, 4..7 -> V) - confirm.
	 *
	 * i4_offset_x and i4_offset_y must be modifiable lvalues. u4_blk_id is
	 * evaluated several times, so it must not have side effects; it is
	 * parenthesized so that expression arguments are handled correctly.
	 */
	#define IND2SUB_CHROMA_MB(u4_blk_id,i4_offset_x,i4_offset_y) \
	{ \
	(i4_offset_x) = (((u4_blk_id) & 0x1) << 3) + ((u4_blk_id) > 3); \
	(i4_offset_y) = ((u4_blk_id) & 0x2) << 1; \
	}


	/*****************************************************************************/
	/* Function Declarations */
	/*****************************************************************************/

	/**
	*******************************************************************************
	*
	* @brief
	* This function does the DCT transform, then the Hadamard transform and
	* the quantization for a macroblock when the mb mode is intra 16x16 mode
	*
	* @par Description:
	* First cf4 is done on all 16 4x4 blocks of the 16x16 input block.
	* Then the Hadamard transform is done on the DC coefficients.
	* Quantization is then performed on the 16x16 block, 4x4 wise.
	*
	* @param[in] ps_codec
	* Pointer to the codec context
	* NOTE(review): not described in the original header - confirm its role
	*
	* @param[in] pu1_src
	* Pointer to source sub-block
	*
	* @param[in] pu1_pred
	* Pointer to prediction sub-block
	*
	* @param[out] pi2_out
	* Pointer to residual sub-block
	* The output will be in linear format
	* The first 16 contiguous locations will contain the values of the DC block
	* After the DC block and a stride, the 1st AC block will follow
	* After one more stride the next AC block will follow
	* The blocks will be in raster scan order
	*
	* @param[in] src_strd
	* Source stride
	*
	* @param[in] pred_strd
	* Prediction stride
	*
	* @param[in] dst_strd
	* Destination stride
	*
	* @param[in] pu2_scale_matrix
	* The quantization matrix for 4x4 transform
	*
	* @param[in] pu2_threshold_matrix
	* Threshold matrix
	*
	* @param[in] u4_qbits
	* 15+QP/6
	*
	* @param[in] u4_round_factor
	* Round factor for quant
	*
	* @param[out] pu1_nnz
	* Memory to store the number of non-zero coefficients after the transform
	* The first byte will be the nnz of the DC block
	* From the next byte the AC nnzs will be stored in raster scan order
	*
	* @param[in] u4_dc_flag
	* Signals if the DC transform is to be done or not
	* 1 -> DC transform will be done
	* 0 -> DC transform will not be done
	*
	* @returns
	* none
	*
	* @remarks
	*
	*******************************************************************************
	*/
	void ih264e_luma_16x16_resi_trans_dctrans_quant(
	codec_t *ps_codec, UWORD8 *pu1_src, UWORD8 *pu1_pred,
	WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd,
	WORD32 dst_strd, const UWORD16 *pu2_scale_matrix,
	const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
	UWORD32 u4_round_factor, UWORD8 *pu1_nnz, UWORD32 u4_dc_flag);

	/**
	*******************************************************************************
	*
	* @brief
	* This function performs the intra 16x16 inverse transform process for H264.
	* It includes the inverse DC transform, inverse quant and then the inverse
	* transform
	*
	* @par Description:
	*
	* @param[in] ps_codec
	* Pointer to the codec context
	* NOTE(review): not described in the original header - confirm its role
	*
	* @param[in] pi2_src
	* Input data, 16x16 size
	* The first 16 memory locations hold the DC coeffs in raster scan order in
	* linear fashion
	* After a stride the 1st AC block will be present, again in raster scan order
	* Then each AC block of the 16x16 block will follow in raster scan order
	*
	* @param[in] pu1_pred
	* The predicted data, 16x16 size
	* Block by block form
	*
	* @param[out] pu1_out
	* Output 16x16
	* In block by block form
	*
	* @param[in] src_strd
	* Source stride
	*
	* @param[in] pred_strd
	* Input stride for prediction buffer
	*
	* @param[in] out_strd
	* Input stride for output buffer
	*
	* @param[in] pu2_iscale_mat
	* Inverse quantization matrix for 4x4 transform
	*
	* @param[in] pu2_weigh_mat
	* Weight matrix of 4x4 transform
	*
	* @param[in] qp_div
	* QP/6
	*
	* @param[in] u4_cntrl
	* Controls the transform path
	* A total of 17 bits are used
	* The 16th bit corresponds to the DC block
	* and bits 32-17 correspond to the AC blocks in raster scan order
	* A bit equal to zero indicates that the entire 4x4 block is zero for DC
	* For AC blocks a bit equal to zero means that all 15 AC coeffs of the
	* block are zero
	* NOTE(review): the original text said "nonzero" for the AC case, which
	* contradicts the DC case - confirm
	*
	* @param[in] u4_dc_trans_flag
	* NOTE(review): not described in the original header; presumably selects
	* whether the inverse DC transform is performed (see ENABLE_DC_TRANSFORM /
	* DISABLE_DC_TRANSFORM) - confirm
	*
	* @param[in] pi4_tmp
	* Input temporary buffer
	* Needs to be at least COFF_CNT_SUB_BLK_4x4+COFF_CNT_SUB_BLK_4x4 in size
	* NOTE(review): the original header also stated "at least 20 in size";
	* the two figures should be reconciled
	*
	* @returns
	* none
	*
	* @remarks
	* The all zero case must be taken care of outside
	*
	*******************************************************************************
	*/
	void ih264e_luma_16x16_idctrans_iquant_itrans_recon(
	codec_t *ps_codec, WORD16 *pi2_src, UWORD8 *pu1_pred,
	UWORD8 *pu1_out, WORD32 src_strd, WORD32 pred_strd,
	WORD32 out_strd, const UWORD16 *pu2_iscale_mat,
	const UWORD16 *pu2_weigh_mat, UWORD32 qp_div, UWORD32 u4_cntrl,
	UWORD32 u4_dc_trans_flag, WORD32 *pi4_tmp);

	/**
	*******************************************************************************
	*
	* @brief
	* This function does the DCT transform, then the Hadamard transform and
	* the quantization for a chroma macroblock
	*
	* @par Description:
	* First cf4 is done on all 16 4x4 blocks of the 8x8 input block
	* (NOTE(review): NUM_CHROMA4x4_BLOCKS_IN_MB is 8 - confirm the block count)
	* Then the Hadamard transform is done on the DC coefficients
	* Quantization is then performed on the 8x8 block, 4x4 wise
	*
	* @param[in] ps_codec
	* Pointer to the codec context
	* NOTE(review): not described in the original header - confirm its role
	*
	* @param[in] pu1_src
	* Pointer to source sub-block
	* The input is in interleaved format for the two chroma planes
	*
	* @param[in] pu1_pred
	* Pointer to prediction sub-block
	* Prediction is in interleaved format
	*
	* @param[out] pi2_out
	* Pointer to residual sub-block
	* The output will be in linear format
	* The first 4 contiguous locations will contain the values of the DC block
	* for U and the next 4 will contain those for V.
	* After the DC block and a stride, the 1st AC block of the U plane will follow
	* After one more stride the next AC block will follow
	* (NOTE(review): the original text said "of V plane" here, which conflicts
	* with the sentence below - confirm)
	* The blocks will be in raster scan order
	*
	* After all the AC blocks of the U plane, the AC blocks of the V plane will
	* follow in exactly the same way
	*
	* @param[in] src_strd
	* Source stride
	*
	* @param[in] pred_strd
	* Prediction stride
	*
	* @param[in] out_strd
	* Destination stride
	*
	* @param[in] pu2_scale_matrix
	* The quantization matrix for 4x4 transform
	*
	* @param[in] pu2_threshold_matrix
	* Threshold matrix
	*
	* @param[in] u4_qbits
	* 15+QP/6
	*
	* @param[in] u4_round_factor
	* Round factor for quant
	*
	* @param[out] pu1_nnz_c
	* Memory to store the number of non-zero coefficients after the transform
	* The first byte will be the nnz of the DC block for the U plane
	* From the next byte the AC nnzs will be stored in raster scan order
	* The fifth byte will be the nnz of the DC block of the V plane
	* Then the AC blocks will follow
	*
	* @returns
	* none
	*
	* @remarks
	* The original header also documented a u4_dc_flag parameter; the prototype
	* has no such parameter.
	*
	*******************************************************************************
	*/
	void ih264e_chroma_8x8_resi_trans_dctrans_quant(
	codec_t *ps_codec, UWORD8 *pu1_src, UWORD8 *pu1_pred,
	WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd,
	WORD32 out_strd, const UWORD16 *pu2_scale_matrix,
	const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
	UWORD32 u4_round_factor, UWORD8 *pu1_nnz_c);

	/**
	*******************************************************************************
	* @brief
	* This function performs the inverse transform with process for chroma MB of H264
	*
	* @par Description:
	* Does inverse DC transform, inverse quantization, inverse transform
	*
	* @param[in] ps_codec
	* Pointer to the codec context
	*
	* @param[in] pi2_src
	* Input data, 16x16 size
	* The input is in the form of, first 4 locations will contain DC coeffs of
	* U plane, next 4 will contain DC coeffs of V plane, then AC blocks of U plane
	* in raster scan order will follow, each block as linear array in raster scan order.
	* After a stride next AC block will follow. After all AC blocks of U plane
	* V plane AC blocks will follow in exact same order.
	*
	* @param[in] pu1_pred
	* The predicted data, 8x16 size, U and V interleaved
	*
	* @param[out] pu1_out
	* Output 8x16, U and V interleaved
	*
	* @param[in] src_strd
	* Source stride
	*
	* @param[in] pred_strd
	* input stride for prediction buffer
	*
	* @param[in] out_strd
	* input stride for output buffer
	*
	* @param[in] pu2_iscale_mat
	* Inverse quantization matrix for 4x4 transform
	*
	* @param[in] pu2_weigh_mat
	* weight matrix of 4x4 transform
	*
	* @param[in] qp_div
	* QP/6
	*
	* @param[in] u4_cntrl
	* Controls the transform path
	* the 15th bit will correspond to DC block of U plane, 14th will indicate the V plane DC block
	* 32-28 bits will indicate AC blocks of U plane in raster scan order
	* 27-23 bits will indicate AC blocks of V plane in raster scan order
	* The bit 1 implies that there is at least one non zero coeff in a block
	*
	* @param[in] pi4_tmp
	* Input temporary buffer
	* needs to be at least COFF_CNT_SUB_BLK_4x4 + Number of DC coeffs for chroma * number of planes
	* in size
	*
	* @returns
	* none
	*
	* @remarks
	*******************************************************************************
	*/
	void ih264e_chroma_8x8_idctrans_iquant_itrans_recon(
	    codec_t *ps_codec, WORD16 *pi2_src, UWORD8 *pu1_pred,
	    UWORD8 *pu1_out, WORD32 src_strd, WORD32 pred_strd,
	    WORD32 out_strd, const UWORD16 *pu2_iscale_mat,
	    const UWORD16 *pu2_weigh_mat, UWORD32 qp_div, UWORD32 u4_cntrl,
	    WORD32 *pi4_tmp);

	/**
	******************************************************************************
	*
	* @brief This function packs residue of an i16x16 luma mb for entropy coding
	*
	* @par Description
	* An i16 macro block contains two classes of units, dc 4x4 block and
	* 4x4 ac blocks. While packing the mb, the dc block is sent first, and
	* the 16 ac blocks are sent next in scan order. Each and every block is
	* represented by 3 parameters (nnz, significant coefficient map and the
	* residue coefficients itself). If a 4x4 unit does not have any coefficients
	* then only nnz is sent. Inside a 4x4 block the individual coefficients are
	* sent in scan order.
	*
	* The first byte of each block will be nnz of the block, if it is non zero,
	* a 2 byte significance map is sent. This is followed by nonzero coefficients.
	* This is repeated for 1 dc + 16 ac blocks.
	*
	* @param[in] pi2_res_mb
	* pointer to residue mb
	*
	* @param[in, out] pv_mb_coeff_data
	* address of the pointer into the buffer of packed residue coefficients
	* (presumably advanced past the data written - confirm against the definition)
	*
	* @param[in] i4_res_strd
	* residual block stride
	*
	* @param[out] u1_cbp_l
	* pointer to the coded block pattern luma
	*
	* @param[in] pu1_nnz
	* pointer to the number of non zero coefficients in each 4x4 unit
	*
	* @param[out] pu4_cntrl
	* Control signal for inverse transform of 16x16 blocks
	*
	* @return none
	*
	* @remarks
	*
	******************************************************************************
	*/
	void ih264e_pack_l_mb_i16(WORD16 *pi2_res_mb, void **pv_mb_coeff_data,
	    WORD32 i4_res_strd, UWORD8 *u1_cbp_l, UWORD8 *pu1_nnz,
	    UWORD32 *pu4_cntrl);

	/**
	******************************************************************************
	*
	* @brief This function packs residue of an i8x8 chroma mb for entropy coding
	*
	* @par Description
	* An i8 chroma macro block contains two classes of units, dc 2x2 block and
	* 4x4 ac blocks. While packing the mb, the dc block is sent first, and
	* the 4 ac blocks are sent next in scan order. Each and every block is
	* represented by 3 parameters (nnz, significant coefficient map and the
	* residue coefficients itself). If a 4x4 unit does not have any coefficients
	* then only nnz is sent. Inside a 4x4 block the individual coefficients are
	* sent in scan order.
	*
	* The first byte of each block will be nnz of the block, if it is non zero,
	* a 2 byte significance map is sent. This is followed by nonzero coefficients.
	* This is repeated for 1 dc + 4 ac blocks.
	*
	* @param[in] pi2_res_mb
	* pointer to residue mb
	*
	* @param[in, out] pv_mb_coeff_data
	* address of the pointer into the buffer of packed residue coefficients
	* (presumably advanced past the data written - confirm against the definition)
	*
	* @param[in] i4_res_strd
	* residual block stride
	*
	* @param[out] u1_cbp_c
	* pointer to the coded block pattern chroma
	*
	* @param[in] pu1_nnz
	* pointer to the number of non zero coefficients in each 4x4 unit
	*
	* @param[in] u4_kill_coffs_flag
	* NOTE(review): not described in the original comment; judging by the name it
	* requests that coefficients be discarded - confirm against the definition
	*
	* @param[out] pu4_cntrl
	* Control signal for inverse transform
	*
	* @param[in] u4_swap_uv
	* Swaps the order of U and V planes in entropy bitstream
	*
	* @return none
	*
	* @remarks
	*
	******************************************************************************
	*/
	void ih264e_pack_c_mb(WORD16 *pi2_res_mb, void **pv_mb_coeff_data,
	    WORD32 i4_res_strd, UWORD8 *u1_cbp_c, UWORD8 *pu1_nnz,
	    UWORD32 u4_kill_coffs_flag, UWORD32 *pu4_cntrl,
	    UWORD32 u4_swap_uv);

	/**
	*******************************************************************************
	*
	* @brief Luma core coding of a macroblock whose intra mode is i16x16
	*
	* @par Description:
	* For an mb of type i16x16, the mb is first predicted with the i16x16
	* prediction filter selected by the chosen intra mode. The error between the
	* input blk and the estimated blk is then computed, transformed hierarchically
	* (dct followed by hadamard) and quantized. Finally, the quantized
	* coefficients are packed in scan order for entropy coding.
	*
	* @param[in] ps_proc
	* pointer to the current macro block context
	*
	* @returns u1_cbp_l
	* coded block pattern luma
	*
	* @remarks none
	*
	*******************************************************************************
	*/
	UWORD8 ih264e_code_luma_intra_macroblock_16x16(process_ctxt_t *ps_proc);

	/**
	*******************************************************************************
	*
	* @brief Luma core coding of a macroblock whose intra mode is i4x4
	*
	* @par Description:
	* For an mb of type i4x4, the mb is first predicted with the i4x4 prediction
	* filter selected by the chosen intra mode. The error between the input blk
	* and the estimated blk is then computed, dct transformed and quantized.
	* Finally, the quantized coefficients are packed in scan order for entropy
	* coding.
	*
	* @param[in] ps_proc
	* pointer to the current macro block context
	*
	* @returns u1_cbp_l
	* coded block pattern luma
	*
	* @remarks
	* The 4x4 subblocks of the 16x16 macroblock are traversed in the scan order
	* mentioned in the h.264 specification
	*
	*******************************************************************************
	*/
	UWORD8 ih264e_code_luma_intra_macroblock_4x4(process_ctxt_t *ps_proc);

	/**
	*******************************************************************************
	*
	* @brief Luma core coding of a macroblock whose intra mode is i4x4
	* (rdopt_on variant)
	*
	* @par Description:
	* For an mb of type i4x4, the mb is first predicted with the i4x4 prediction
	* filter selected by the chosen intra mode. The error between the input blk
	* and the estimated blk is then computed, dct transformed and quantized.
	* Finally, the quantized coefficients are packed in scan order for entropy
	* coding.
	*
	* @param[in] ps_proc
	* pointer to the current macro block context
	*
	* @returns u1_cbp_l
	* coded block pattern luma
	*
	* @remarks
	* The 4x4 subblocks of the 16x16 macroblock are traversed in the scan order
	* mentioned in the h.264 specification
	* NOTE(review): this description matches ih264e_code_luma_intra_macroblock_4x4
	* word for word; how the rdopt_on variant differs is not stated here - confirm
	* against the definition
	*
	*******************************************************************************
	*/
	UWORD8 ih264e_code_luma_intra_macroblock_4x4_rdopt_on(process_ctxt_t *ps_proc);

	/**
	*******************************************************************************
	*
	* @brief Chroma core coding of an intra macroblock
	*
	* @par Description:
	* For an MB that is intra coded with mb type chroma I8x8, the MB is first
	* predicted using intra 8x8 prediction filters. The error between the
	* predicted data and the input is transformed, and the DC coefficients of
	* each transformed sub block are transformed once more with the Hadamard
	* transform. The resulting coefficients are quantized, packed and sent for
	* entropy coding.
	*
	* @param[in] ps_proc
	* pointer to the current macro block context
	*
	* @returns u1_cbp_c
	* coded block pattern chroma
	*
	* @remarks
	* The 4x4 subblocks of the 8x8 macroblock are traversed in the scan order
	* mentioned in the h.264 specification
	*
	*******************************************************************************
	*/
	UWORD8 ih264e_code_chroma_intra_macroblock_8x8(process_ctxt_t *ps_proc);

	/**
	*******************************************************************************
	* @brief Luma core coding of an inter macroblock
	*
	* @par Description:
	* For an mb coded as inter predicted, prediction is done based on the sub mb
	* partitions and the corresponding motion vectors generated by ME. The error
	* between the input blk and the estimated blk is then computed, transformed
	* (dct, without hadamard) and quantized. Finally, the quantized coefficients
	* are packed in scan order for entropy coding.
	*
	* @param[in] ps_proc
	* pointer to the current macro block context
	*
	* @returns u1_cbp_l
	* coded block pattern luma
	*
	* @remarks none
	*
	*******************************************************************************
	*/
	UWORD8 ih264e_code_luma_inter_macroblock_16x16(process_ctxt_t *ps_proc);

	/**
	*******************************************************************************
	* @brief Chroma core coding of an inter macroblock
	*
	* @par Description:
	* For an mb coded as inter predicted, prediction is done based on the sub mb
	* partitions and the corresponding motion vectors generated by ME. The error
	* between the input blk and the estimated blk is then computed, transformed
	* and quantized. Finally, the quantized coefficients are packed in scan order
	* for entropy coding.
	*
	* @param[in] ps_proc
	* pointer to the current macro block context
	*
	* @returns
	* coded block pattern
	* NOTE(review): the original comment labelled the return as u1_cbp_l (luma);
	* for a chroma routine this is presumably the chroma coded block pattern -
	* confirm against the definition
	*
	* @remarks none
	*
	*******************************************************************************
	*/
	UWORD8 ih264e_code_chroma_inter_macroblock_8x8(process_ctxt_t *ps_proc);

	#endif /* IH264E_CORE_CODING_H_ */