encoder/ihevce_had_satd.c - platform/external/libhevc - Git at Google

 /******************************************************************************
  *
  * Copyright (C) 2018 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at:
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
  *****************************************************************************
  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

 /**
 ******************************************************************************
 * @file ihevce_had_satd.c
 *
 * @brief
 *    This file contains functions of Hadamard SAD and SATD
 *
 * @author
 *    Ittiam
 *
 * List of Functions
 *   <TODO: TO BE ADDED>
 *
 ******************************************************************************
 */

 /*****************************************************************************/
 /* File Includes                                                             */
 /*****************************************************************************/
 /* System include files */
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
 #include <assert.h>
 #include <stdarg.h>
 #include <math.h>

 /* User include files */
 #include "ihevc_typedefs.h"
 #include "itt_video_api.h"
 #include "ihevce_api.h"

 #include "rc_cntrl_param.h"
 #include "rc_frame_info_collector.h"
 #include "rc_look_ahead_params.h"

 #include "ihevc_defs.h"
 #include "ihevc_structs.h"
 #include "ihevc_platform_macros.h"
 #include "ihevc_deblk.h"
 #include "ihevc_itrans_recon.h"
 #include "ihevc_chroma_itrans_recon.h"
 #include "ihevc_chroma_intra_pred.h"
 #include "ihevc_intra_pred.h"
 #include "ihevc_inter_pred.h"
 #include "ihevc_mem_fns.h"
 #include "ihevc_padding.h"
 #include "ihevc_weighted_pred.h"
 #include "ihevc_sao.h"
 #include "ihevc_resi_trans.h"
 #include "ihevc_quant_iquant_ssd.h"
 #include "ihevc_cabac_tables.h"

 #include "ihevce_defs.h"
 #include "ihevce_lap_enc_structs.h"
 #include "ihevce_multi_thrd_structs.h"
 #include "ihevce_multi_thrd_funcs.h"
 #include "ihevce_me_common_defs.h"
 #include "ihevce_had_satd.h"
 #include "ihevce_error_codes.h"
 #include "ihevce_bitstream.h"
 #include "ihevce_cabac.h"
 #include "ihevce_rdoq_macros.h"
 #include "ihevce_function_selector.h"
 #include "ihevce_enc_structs.h"
 #include "ihevce_cmn_utils_instr_set_router.h"
 #include "hme_datatype.h"
 #include "hme_interface.h"
 #include "hme_common_defs.h"
 #include "hme_defs.h"

 /*****************************************************************************/
 /* Function Definitions                                                      */
 /*****************************************************************************/

 static void ihevce_hadamard_4x4_8bit(
     UWORD8 *pu1_src,
     WORD32 src_strd,
     UWORD8 *pu1_pred,
     WORD32 pred_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd)
 {
     WORD32 k;
     WORD16 m[16];

     /*===== hadamard horz transform =====*/
     for(k = 0; k < 4; k++)
     {
         WORD32 r0, r1, r2, r3;
         WORD32 h0, h1, h2, h3;

         /* Compute the residue block */
         r0 = pu1_src[0] - pu1_pred[0];
         r1 = pu1_src[1] - pu1_pred[1];
         r2 = pu1_src[2] - pu1_pred[2];
         r3 = pu1_src[3] - pu1_pred[3];

         h0 = r0 + r1;
         h1 = r0 - r1;
         h2 = r2 + r3;
         h3 = r2 - r3;

         m[k * 4 + 0] = h0 + h2;
         m[k * 4 + 1] = h1 + h3;
         m[k * 4 + 2] = h0 - h2;
         m[k * 4 + 3] = h1 - h3;

         pu1_pred += pred_strd;
         pu1_src += src_strd;
     }

     /*===== hadamard vert transform =====*/
     for(k = 0; k < 4; k++)
     {
         WORD32 v0, v1, v2, v3;

         v0 = m[0 + k] + m[4 + k];
         v1 = m[0 + k] - m[4 + k];
         v2 = m[8 + k] + m[12 + k];
         v3 = m[8 + k] - m[12 + k];

         pi2_dst[0 * dst_strd + k] = v0 + v2;
         pi2_dst[1 * dst_strd + k] = v1 + v3;
         pi2_dst[2 * dst_strd + k] = v0 - v2;
         pi2_dst[3 * dst_strd + k] = v1 - v3;
     }
 }

 static void ihevce_hadamard_8x8_8bit(
     UWORD8 *pu1_src,
     WORD32 src_strd,
     UWORD8 *pu1_pred,
     WORD32 pred_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd)
 {
     WORD32 i;

     // y0
     ihevce_hadamard_4x4_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd);
     // y1
     ihevce_hadamard_4x4_8bit(pu1_src + 4, src_strd, pu1_pred + 4, pred_strd, pi2_dst + 4, dst_strd);
     // y2
     ihevce_hadamard_4x4_8bit(
         pu1_src + 4 * src_strd,
         src_strd,
         pu1_pred + 4 * pred_strd,
         pred_strd,
         pi2_dst + (4 * dst_strd),
         dst_strd);
     // y3
     ihevce_hadamard_4x4_8bit(
         pu1_src + 4 + 4 * src_strd,
         src_strd,
         pu1_pred + 4 + 4 * pred_strd,
         pred_strd,
         pi2_dst + (4 * dst_strd) + 4,
         dst_strd);

     /*   Child HAD results combined as follows to get Parent result */
     /*  _                                                 _         */
     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
     /* \-                                                 -/        */
     for(i = 0; i < 16; i++)
     {
         WORD32 idx = (i >> 2) * dst_strd + (i % 4);
         WORD16 a0 = pi2_dst[idx];
         WORD16 a1 = pi2_dst[4 + idx];
         WORD16 a2 = pi2_dst[(4 * dst_strd) + idx];
         WORD16 a3 = pi2_dst[(4 * dst_strd) + 4 + idx];

         WORD16 b0 = (a0 + a1);
         WORD16 b1 = (a0 - a1);
         WORD16 b2 = (a2 + a3);
         WORD16 b3 = (a2 - a3);

         pi2_dst[idx] = b0 + b2;
         pi2_dst[4 + idx] = b1 + b3;
         pi2_dst[(4 * dst_strd) + idx] = b0 - b2;
         pi2_dst[(4 * dst_strd) + 4 + idx] = b1 - b3;
     }
 }

 static void ihevce_hadamard_16x16_8bit(
     UWORD8 *pu1_src,
     WORD32 src_strd,
     UWORD8 *pu1_pred,
     WORD32 pred_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd)
 {
     WORD32 i;

     // y0
     ihevce_hadamard_8x8_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd);
     // y1
     ihevce_hadamard_8x8_8bit(pu1_src + 8, src_strd, pu1_pred + 8, pred_strd, pi2_dst + 8, dst_strd);
     // y2
     ihevce_hadamard_8x8_8bit(
         pu1_src + 8 * src_strd,
         src_strd,
         pu1_pred + 8 * pred_strd,
         pred_strd,
         pi2_dst + (8 * dst_strd),
         dst_strd);
     // y3
     ihevce_hadamard_8x8_8bit(
         pu1_src + 8 + 8 * src_strd,
         src_strd,
         pu1_pred + 8 + 8 * pred_strd,
         pred_strd,
         pi2_dst + (8 * dst_strd) + 8,
         dst_strd);

     /*   Child HAD results combined as follows to get Parent result */
     /*  _                                                 _         */
     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
     /* \-                                                 -/        */
     for(i = 0; i < 64; i++)
     {
         WORD32 idx = (i >> 3) * dst_strd + (i % 8);
         WORD16 a0 = pi2_dst[idx];
         WORD16 a1 = pi2_dst[8 + idx];
         WORD16 a2 = pi2_dst[(8 * dst_strd) + idx];
         WORD16 a3 = pi2_dst[(8 * dst_strd) + 8 + idx];

         WORD16 b0 = (a0 + a1) >> 1;
         WORD16 b1 = (a0 - a1) >> 1;
         WORD16 b2 = (a2 + a3) >> 1;
         WORD16 b3 = (a2 - a3) >> 1;

         pi2_dst[idx] = b0 + b2;
         pi2_dst[8 + idx] = b1 + b3;
         pi2_dst[(8 * dst_strd) + idx] = b0 - b2;
         pi2_dst[(8 * dst_strd) + 8 + idx] = b1 - b3;
     }
 }

 static void ihevce_hadamard_32x32_8bit(
     UWORD8 *pu1_src,
     WORD32 src_strd,
     UWORD8 *pu1_pred,
     WORD32 pred_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd)
 {
     WORD32 i;

     // y0
     ihevce_hadamard_16x16_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd);
     // y1
     ihevce_hadamard_16x16_8bit(
         pu1_src + 16, src_strd, pu1_pred + 16, pred_strd, pi2_dst + 16, dst_strd);
     // y2
     ihevce_hadamard_16x16_8bit(
         pu1_src + 16 * src_strd,
         src_strd,
         pu1_pred + 16 * pred_strd,
         pred_strd,
         pi2_dst + (16 * dst_strd),
         dst_strd);
     // y3
     ihevce_hadamard_16x16_8bit(
         pu1_src + 16 + 16 * src_strd,
         src_strd,
         pu1_pred + 16 + 16 * pred_strd,
         pred_strd,
         pi2_dst + (16 * dst_strd) + 16,
         dst_strd);

     /*   Child HAD results combined as follows to get Parent result */
     /*  _                                                 _         */
     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
     /* \-                                                 -/        */
     for(i = 0; i < 256; i++)
     {
         WORD32 idx = (i >> 4) * dst_strd + (i % 16);
         WORD16 a0 = pi2_dst[idx] >> 2;
         WORD16 a1 = pi2_dst[16 + idx] >> 2;
         WORD16 a2 = pi2_dst[(16 * dst_strd) + idx] >> 2;
         WORD16 a3 = pi2_dst[(16 * dst_strd) + 16 + idx] >> 2;

         WORD16 b0 = (a0 + a1);
         WORD16 b1 = (a0 - a1);
         WORD16 b2 = (a2 + a3);
         WORD16 b3 = (a2 - a3);

         pi2_dst[idx] = b0 + b2;
         pi2_dst[16 + idx] = b1 + b3;
         pi2_dst[(16 * dst_strd) + idx] = b0 - b2;
         pi2_dst[(16 * dst_strd) + 16 + idx] = b1 - b3;
     }
 }

 /**
 *******************************************************************************
 *
 * @brief
 *  Compute Hadamard sad for 4x4 block with 8-bit input
 *
 * @par Description:
 *
 * @param[in] pu1_origin
 *  UWORD8 pointer to the current block
 *
 * @param[in] src_strd
 *  WORD32 Source stride
 *
 * @param[in] pu1_pred_buf
 *  UWORD8 pointer to the prediction block
 *
 * @param[in] pred_strd
 *  WORD32 Pred stride
 *
 * @param[in] pi2_dst
 *  WORD16 pointer to the transform block
 *
 * @param[in] dst_strd
 *  WORD32 Destination stride
 *
 * @param[in] size
 *  WORD32 transform Block size
 *
 * @returns hadamard SAD
 *
 * @remarks
 *  Not updating the transform destination now. Only returning the SATD
 *
 *******************************************************************************
 */
 UWORD32 ihevce_HAD_4x4_8bit(
     UWORD8 *pu1_origin,
     WORD32 src_strd,
     UWORD8 *pu1_pred_buf,
     WORD32 pred_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd)
 {
     WORD32 k;
     WORD16 v[16];
     UWORD32 u4_sad = 0;

     (void)pi2_dst;
     (void)dst_strd;
     ihevce_hadamard_4x4_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 4);

     for(k = 0; k < 16; ++k)
         u4_sad += abs(v[k]);
     u4_sad = ((u4_sad + 2) >> 2);

     return u4_sad;
 }

 /**
 *******************************************************************************
 *
 * @brief
 *  Computes Hadamard Sad for 8x8 block with 8-bit input
 *
 * @par Description:
 *
 * @param[in] pu1_origin
 *  UWORD8 pointer to the current block
 *
 * @param[in] src_strd
 *  WORD32 Source stride
 *
 * @param[in] pu1_pred_buf
 *  UWORD8 pointer to the prediction block
 *
 * @param[in] pred_strd
 *  WORD32 Pred stride
 *
 * @param[in] pi2_dst
 *  WORD16 pointer to the transform block
 *
 * @param[in] dst_strd
 *  WORD32 Destination stride
 *
 * @param[in] size
 *  WORD32 transform Block size
 *
 * @returns Hadamard SAD
 *
 * @remarks
 *  Not updating the transform destination now. Only returning the SATD
 *
 *******************************************************************************
 */
 UWORD32 ihevce_HAD_8x8_8bit(
     UWORD8 *pu1_origin,
     WORD32 src_strd,
     UWORD8 *pu1_pred_buf,
     WORD32 pred_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd)
 {
     WORD32 k;
     UWORD32 u4_sad = 0;
     WORD16 v[64];

     (void)pi2_dst;
     (void)dst_strd;
     ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8);

     for(k = 0; k < 64; ++k)
         u4_sad += abs(v[k]);
     u4_sad = ((u4_sad + 4) >> 3);

     return u4_sad;
 }

 /**
 *******************************************************************************
 *
 * @brief
 *  Compute dc suppressed hadamard sad for 8x8 block with 8-bit input
 *
 * @par Description:
 *
 * @param[in] pu1_origin
 *  UWORD8 pointer to the current block
 *
 * @param[in] src_strd
 *  WORD32 Source stride
 *
 * @param[in] pu1_pred_buf
 *  UWORD8 pointer to the prediction block
 *
 * @param[in] pred_strd
 *  WORD32 Pred stride
 *
 * @param[in] pi2_dst
 *  WORD16 pointer to the transform block
 *
 * @param[in] dst_strd
 *  WORD32 Destination stride
 *
 * @param[in] size
 *  WORD32 transform Block size
 *
 * @returns Hadamard SAD with DC Suppressed
 *
 * @remarks
 *  Not updating the transform destination now. Only returning the SATD
 *
 *******************************************************************************
 */
 UWORD32 ihevce_compute_ac_had_8x8_8bit(
     UWORD8 *pu1_origin,
     WORD32 src_strd,
     UWORD8 *pu1_pred_buf,
     WORD32 pred_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd)
 {
     WORD32 k;
     UWORD32 u4_sad = 0;
     WORD16 v[64];

     (void)pi2_dst;
     (void)dst_strd;
     ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8);

     v[0] = 0;
     for(k = 0; k < 64; ++k)
         u4_sad += abs(v[k]);
     u4_sad = ((u4_sad + 4) >> 3);

     return u4_sad;
 }

 /**
 *******************************************************************************
 *
 * @brief
 *  Computes Hadamard Sad for 16x16 block with 8-bit input
 *
 * @par Description:
 *
 * @param[in] pu1_origin
 *  UWORD8 pointer to the current block
 *
 * @param[in] src_strd
 *  WORD32 Source stride
 *
 * @param[in] pu1_pred_buf
 *  UWORD8 pointer to the prediction block
 *
 * @param[in] pred_strd
 *  WORD32 Pred stride
 *
 * @param[in] pi2_dst
 *  WORD16 pointer to the transform block
 *
 * @param[in] dst_strd
 *  WORD32 Destination stride
 *
 * @param[in] size
 *  WORD32 transform Block size
 *
 * @returns Hadamard SAD
 *
 * @remarks
 *  Not updating the transform destination now. Only returning the SATD
 *
 *******************************************************************************
 */
 UWORD32 ihevce_HAD_16x16_8bit(
     UWORD8 *pu1_origin,
     WORD32 src_strd,
     UWORD8 *pu1_pred_buf,
     WORD32 pred_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd)
 {
     WORD32 k;
     UWORD32 u4_sad = 0;
     WORD16 v[256];

     (void)pi2_dst;
     (void)dst_strd;
     ihevce_hadamard_16x16_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 16);

     for(k = 0; k < 256; ++k)
         u4_sad += abs(v[k]);
     u4_sad = ((u4_sad + 4) >> 3);

     return u4_sad;
 }

 /**
 *******************************************************************************
 *
 * @brief
 *  Computes Hadamard Sad for 32x32 block with 8-bit input
 *
 * @par Description:
 *
 * @param[in] pu1_origin
 *  UWORD8 pointer to the current block
 *
 * @param[in] src_strd
 *  WORD32 Source stride
 *
 * @param[in] pu1_pred_buf
 *  UWORD8 pointer to the prediction block
 *
 * @param[in] pred_strd
 *  WORD32 Pred stride
 *
 * @param[in] pi2_dst
 *  WORD16 pointer to the transform block
 *
 * @param[in] dst_strd
 *  WORD32 Destination stride
 *
 * @param[in] size
 *  WORD32 transform Block size
 *
 * @returns Hadamard SAD
 *
 * @remarks
 *  Not updating the transform destination now. Only returning the SATD
 *
 *******************************************************************************
 */
 UWORD32 ihevce_HAD_32x32_8bit(
     UWORD8 *pu1_origin,
     WORD32 src_strd,
     UWORD8 *pu1_pred_buf,
     WORD32 pred_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd)
 {
     WORD32 k;
     UWORD32 u4_sad = 0;
     WORD16 v[32 * 32];

     (void)pi2_dst;
     (void)dst_strd;
     ihevce_hadamard_32x32_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 32);

     for(k = 0; k < 32 * 32; ++k)
         u4_sad += abs(v[k]);
     u4_sad = ((u4_sad + 2) >> 2);

     return u4_sad;
 }

 //#if COMPUTE_16x16_R == C
 /**
 *******************************************************************************
 *
 * @brief
 *   Computes 8x8 transform using children 4x4 hadamard results
 *
 * @par Description:
 *
 * @param[in] pi2_4x4_had
 *  WORD16 pointer to 4x4 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
 *
 * @param[in] had4_strd
 *  stride of 4x4 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
 *
 * @param[out] pi2_dst
 *  destination buffer where 8x8 hadamard result is stored
 *
 * @param[in] dst_stride
 *  stride of destination block
 *
 * @param[in] i4_frm_qstep
 *  frm_qstep value based on the which the threshold value is calculated
 *
 * @returns
 *  8x8 Hadamard SATD
 * @remarks
 *
 *******************************************************************************
 */
 static UWORD32 ihevce_compute_8x8HAD_using_4x4(
     WORD16 *pi2_4x4_had,
     WORD32 had4_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd,
     WORD32 i4_frm_qstep,
     WORD32 *pi4_cbf)
 {
     /* Qstep value is right shifted by 8 */
     WORD32 threshold = (i4_frm_qstep >> 8);

     /* Initialize pointers to 4 subblocks of 4x4 HAD buffer */
     WORD16 *pi2_y0 = pi2_4x4_had;
     WORD16 *pi2_y1 = pi2_4x4_had + 4;
     WORD16 *pi2_y2 = pi2_4x4_had + had4_strd * 4;
     WORD16 *pi2_y3 = pi2_4x4_had + had4_strd * 4 + 4;

     /* Initialize pointers to store 8x8 HAD output */
     WORD16 *pi2_dst0 = pi2_dst;
     WORD16 *pi2_dst1 = pi2_dst + 4;
     WORD16 *pi2_dst2 = pi2_dst + dst_strd * 4;
     WORD16 *pi2_dst3 = pi2_dst + dst_strd * 4 + 4;

     UWORD32 u4_satd = 0;
     WORD32 i;

     /*   Child HAD results combined as follows to get Parent result */
     /*  _                                                 _         */
     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
     /* \-                                                 -/        */
     for(i = 0; i < 16; i++)
     {
         WORD32 src_idx = (i >> 2) * had4_strd + (i % 4);
         WORD32 dst_idx = (i >> 2) * dst_strd + (i % 4);

         WORD16 a0 = pi2_y0[src_idx];
         WORD16 a1 = pi2_y1[src_idx];
         WORD16 a2 = pi2_y2[src_idx];
         WORD16 a3 = pi2_y3[src_idx];

         WORD16 b0 = (a0 + a1);
         WORD16 b1 = (a0 - a1);
         WORD16 b2 = (a2 + a3);
         WORD16 b3 = (a2 - a3);

         pi2_dst0[dst_idx] = b0 + b2;
         pi2_dst1[dst_idx] = b1 + b3;
         pi2_dst2[dst_idx] = b0 - b2;
         pi2_dst3[dst_idx] = b1 - b3;

         if(ABS(pi2_dst0[dst_idx]) > threshold)
             *pi4_cbf = 1;
         if(ABS(pi2_dst1[dst_idx]) > threshold)
             *pi4_cbf = 1;
         if(ABS(pi2_dst2[dst_idx]) > threshold)
             *pi4_cbf = 1;
         if(ABS(pi2_dst3[dst_idx]) > threshold)
             *pi4_cbf = 1;

         u4_satd += ABS(pi2_dst0[dst_idx]);
         u4_satd += ABS(pi2_dst1[dst_idx]);
         u4_satd += ABS(pi2_dst2[dst_idx]);
         u4_satd += ABS(pi2_dst3[dst_idx]);
     }

     /* return the 8x8 satd */
     return (u4_satd);
 }

 /**
 *******************************************************************************
 *
 * @brief
 *    Computes Residue and Hadamard Transform for four 4x4 blocks (Z scan) of
 *    a 8x8 block (Residue is computed for 8-bit src and prediction buffers)
 *    Modified to incorporate the dead-zone implementation - Lokesh
 *
 * @par Description:
 *
 * @param[in] pu1_origin
 *  UWORD8 pointer to the current block
 *
 * @param[in] src_strd
 *  WORD32 Source stride
 *
 * @param[in] pu1_pred
 *  UWORD8 pointer to the prediction block
 *
 * @param[in] pred_strd
 *  WORD32 Pred stride
 *
 * @param[out] pi2_dst
 *  WORD16 pointer to the transform block
 *
 * @param[in] dst_strd
 *  WORD32 Destination stride
 *
 * @param[out] pi4_hsad
 *  array for storing hadmard sad of each 4x4 block
 *
 * @param[in] hsad_stride
 *  stride of hadmard sad destination buffer (for Zscan order of storing sads)
 *
 * @param[in] i4_frm_qstep
 *  frm_qstep value based on the which the threshold value is calculated
 *
 * @returns
 *
 * @remarks
 *
 *******************************************************************************
 */
 static WORD32 ihevce_had4_4x4(
     UWORD8 *pu1_src,
     WORD32 src_strd,
     UWORD8 *pu1_pred,
     WORD32 pred_strd,
     WORD16 *pi2_dst4x4,
     WORD32 dst_strd,
     WORD32 *pi4_hsad,
     WORD32 hsad_stride,
     WORD32 i4_frm_qstep)
 {
     WORD32 i, k;
     WORD32 i4_child_total_sad = 0;

     (void)i4_frm_qstep;
     /* -------- Compute four 4x4 HAD Transforms ---------*/
     for(i = 0; i < 4; i++)
     {
         UWORD8 *pu1_pi0, *pu1_pi1;
         WORD16 *pi2_dst;
         WORD32 blkx, blky;
         UWORD32 u4_hsad = 0;
         // TODO: choose deadzone as f(qstep)
         WORD32 threshold = 0;

         /*****************************************************/
         /*    Assuming the looping structure of the four     */
         /*    blocks is in Z scan order of 4x4s in a 8x8     */
         /*    block instead of raster scan                   */
         /*****************************************************/
         blkx = (i & 0x1);
         blky = (i >> 1);

         pu1_pi0 = pu1_src + (blkx * 4) + (blky * 4 * src_strd);
         pu1_pi1 = pu1_pred + (blkx * 4) + (blky * 4 * pred_strd);
         pi2_dst = pi2_dst4x4 + (blkx * 4) + (blky * 4 * dst_strd);

         ihevce_hadamard_4x4_8bit(pu1_pi0, src_strd, pu1_pi1, pred_strd, pi2_dst, dst_strd);

         for(k = 0; k < 4; k++)
         {
             if(ABS(pi2_dst[0 * dst_strd + k]) < threshold)
                 pi2_dst[0 * dst_strd + k] = 0;

             if(ABS(pi2_dst[1 * dst_strd + k]) < threshold)
                 pi2_dst[1 * dst_strd + k] = 0;

             if(ABS(pi2_dst[2 * dst_strd + k]) < threshold)
                 pi2_dst[2 * dst_strd + k] = 0;

             if(ABS(pi2_dst[3 * dst_strd + k]) < threshold)
                 pi2_dst[3 * dst_strd + k] = 0;

             /* Accumulate the SATD */
             u4_hsad += ABS(pi2_dst[0 * dst_strd + k]);
             u4_hsad += ABS(pi2_dst[1 * dst_strd + k]);
             u4_hsad += ABS(pi2_dst[2 * dst_strd + k]);
             u4_hsad += ABS(pi2_dst[3 * dst_strd + k]);
         }

         /*===== Normalize the HSAD =====*/
         pi4_hsad[blkx + (blky * hsad_stride)] = ((u4_hsad + 2) >> 2);
         i4_child_total_sad += ((u4_hsad + 2) >> 2);
     }
     return i4_child_total_sad;
 }

 /**
 *******************************************************************************
 *
 * @brief
 *    HSAD is returned for the 4, 4x4 in 8x8
 *
 * @par Description:
 *
 * @param[in] pu1_origin
 *  UWORD8 pointer to the current block
 *
 * @param[in] src_strd
 *  WORD32 Source stride
 *
 * @param[in] pu1_pred
 *  UWORD8 pointer to the prediction block
 *
 * @param[in] pred_strd
 *  WORD32 Pred stride
 *
 * @param[out] pi2_dst
 *  WORD16 pointer to the transform output block
 *
 * @param[out] dst_strd
 *  WORD32 Destination stride
 *
 * @param[out] ppi4_hsad
 *   pointer to base pointers for storing hadmard sads of various
 *   block sizes (4x4 to 32x32)
 *
 * @param[in] pos_x_y_4x4
 *   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
 *   Lower 16bits denote xpos and upper 16ypos of the 4x4block
 *
 * @param[in] num_4x4_in_row
 *   Denotes the number of current 4x4 blocks in a ctb/CU/MB
 *
 * @returns
 *
 * @remarks
 *
 *******************************************************************************
 */
 void ihevce_had_8x8_using_4_4x4(
     UWORD8 *pu1_src,
     WORD32 src_strd,
     UWORD8 *pu1_pred,
     WORD32 pred_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd,
     WORD32 **ppi4_hsad,
     WORD32 pos_x_y_4x4,
     WORD32 num_4x4_in_row)
 {
     WORD16 ai2_4x4_had[64];
     WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
     WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
     WORD32 *pi4_4x4_hsad;
     WORD32 *pi4_8x8_hsad;

     (void)pi2_dst;
     (void)dst_strd;
     ASSERT(pos_x >= 0);
     ASSERT(pos_y >= 0);

     /* Initialize pointers to  store 4x4 and 8x8 HAD SATDs */
     pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
     pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);

     /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
     pi4_8x8_hsad[0] = ihevce_had4_4x4(
         pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0);
 }

 /**
 *******************************************************************************
 *
 * @brief
 *    Reursive Hadamard Transform for 8x8 block. HSAD is returned for the 8x8
 *    block and its four subblocks(4x4).
 *
 * @par Description:
 *
 * @param[in] pu1_origin
 *  UWORD8 pointer to the current block
 *
 * @param[in] src_strd
 *  WORD32 Source stride
 *
 * @param[in] pu1_pred
 *  UWORD8 pointer to the prediction block
 *
 * @param[in] pred_strd
 *  WORD32 Pred stride
 *
 * @param[out] pi2_dst
 *  WORD16 pointer to the transform output block
 *
 * @param[out] dst_strd
 *  WORD32 Destination stride
 *
 * @param[out] ppi4_hsad
 *   pointer to base pointers for storing hadmard sads of various
 *   block sizes (4x4 to 32x32)
 *
 * @param[in] pos_x_y_4x4
 *   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
 *   Lower 16bits denote xpos and upper 16ypos of the 4x4block
 *
 * @param[in] num_4x4_in_row
 *   Denotes the number of current 4x4 blocks in a ctb/CU/MB
 *
 * @param[in] i4_frm_qstep
 *  frm_qstep value based on the which the threshold value is calculated
 *
 * @returns
 *
 * @remarks
 *
 *******************************************************************************
 */
 WORD32 ihevce_had_8x8_using_4_4x4_r(
     UWORD8 *pu1_src,
     WORD32 src_strd,
     UWORD8 *pu1_pred,
     WORD32 pred_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd,
     WORD32 **ppi4_hsad,
     WORD32 **ppi4_tu_split,
     WORD32 **ppi4_tu_early_cbf,
     WORD32 pos_x_y_4x4,
     WORD32 num_4x4_in_row,
     WORD32 lambda,
     WORD32 lambda_q_shift,
     WORD32 i4_frm_qstep,
     WORD32 i4_cur_depth,
     WORD32 i4_max_depth,
     WORD32 i4_max_tr_size,
     WORD32 *pi4_tu_split_cost,
     void *pv_func_sel)
 {
     WORD16 ai2_4x4_had[64];
     WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
     WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
     WORD32 *pi4_4x4_hsad;
     WORD32 *pi4_8x8_hsad;
     WORD32 *pi4_8x8_tu_split;

     WORD32 *pi4_8x8_tu_early_cbf;

     UWORD32 u4_satd;
     WORD32 cost_child = 0, cost_parent = 0;
     WORD32 early_cbf = 0;

     const UWORD8 u1_cur_tr_size = 8;
     /* Stores the best cost for the Current 8x8: Lokesh */
     WORD32 best_cost = 0;

     (void)pv_func_sel;
     ASSERT(pos_x >= 0);
     ASSERT(pos_y >= 0);

     /* Initialize pointers to  store 4x4 and 8x8 HAD SATDs */
     pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
     pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
     pi4_8x8_tu_split = ppi4_tu_split[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
     pi4_8x8_tu_early_cbf =
         ppi4_tu_early_cbf[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);

     /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
     cost_child = ihevce_had4_4x4(
         pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0);

     /* -------- Compute 8x8 HAD Transform using 4x4 results ------------- */
     u4_satd = ihevce_compute_8x8HAD_using_4x4(
         ai2_4x4_had, 8, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);

     /* store the normalized 8x8 satd */
     cost_parent = ((u4_satd + 4) >> 3);

     /* 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
     cost_child += ((4) * lambda) >> (lambda_q_shift + 1);

     if(i4_cur_depth < i4_max_depth)
     {
         if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
         {
             //cost_child -= ((4) * lambda) >> (lambda_q_shift + 1);
             *pi4_tu_split_cost += (4 * lambda) >> (lambda_q_shift + 1);
             best_cost = cost_child;
             best_cost <<= 1;
             best_cost++;
             pi4_8x8_tu_split[0] = 1;
             pi4_8x8_hsad[0] = cost_child;
         }
         else
         {
             //cost_parent -= ((1) * lambda) >>  (lambda_q_shift + 1);
             best_cost = cost_parent;
             best_cost <<= 1;
             pi4_8x8_tu_split[0] = 0;
             pi4_8x8_hsad[0] = cost_parent;
         }
     }
     else
     {
         //cost_parent -= ((1) * lambda) >>  (lambda_q_shift + 1);
         best_cost = cost_parent;
         best_cost <<= 1;
         pi4_8x8_tu_split[0] = 0;
         pi4_8x8_hsad[0] = cost_parent;
     }

     pi4_8x8_tu_early_cbf[0] = early_cbf;

     /* best cost has tu_split_flag at LSB(Least significant bit) */
     return ((best_cost << 1) + early_cbf);
 }

 /**
 *******************************************************************************
 *
 * @brief
 *   Computes 16x16 transform using children 8x8 hadamard results
 *    Modified to incorporate the dead-zone implementation - Lokesh
 *
 * @par Description:
 *
 * @param[in] pi2_8x8_had
 *  WORD16 pointer to 8x8 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
 *
 * @param[in] had8_strd
 *  stride of 8x8 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
 *
 * @param[out] pi2_dst
 *  destination buffer where 8x8 hadamard result is stored
 *
 * @param[in] dst_stride
 *  stride of destination block
 *
 * @param[in] i4_frm_qstep
 *  frm_qstep value based on the which the threshold value is calculated
 *
 * @returns
 *  16x16 Hadamard SATD
 * @remarks
 *
 *******************************************************************************
 */
 static UWORD32 ihevce_compute_16x16HAD_using_8x8(
     WORD16 *pi2_8x8_had,
     WORD32 had8_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd,
     WORD32 i4_frm_qstep,
     WORD32 *pi4_cbf)
 {
     /* Qstep value is right shifted by 8 */
     WORD32 threshold = (i4_frm_qstep >> 8);

     /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */
     WORD16 *pi2_y0 = pi2_8x8_had;
     WORD16 *pi2_y1 = pi2_8x8_had + 8;
     WORD16 *pi2_y2 = pi2_8x8_had + had8_strd * 8;
     WORD16 *pi2_y3 = pi2_8x8_had + had8_strd * 8 + 8;

     /* Initialize pointers to store 8x8 HAD output */
     WORD16 *pi2_dst0 = pi2_dst;
     WORD16 *pi2_dst1 = pi2_dst + 8;
     WORD16 *pi2_dst2 = pi2_dst + dst_strd * 8;
     WORD16 *pi2_dst3 = pi2_dst + dst_strd * 8 + 8;

     UWORD32 u4_satd = 0;
     WORD32 i;

     /*   Child HAD results combined as follows to get Parent result */
     /*  _                                                 _         */
     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
     /* \-                                                 -/        */
     for(i = 0; i < 64; i++)
     {
         WORD32 src_idx = (i >> 3) * had8_strd + (i % 8);
         WORD32 dst_idx = (i >> 3) * dst_strd + (i % 8);

         WORD16 a0 = pi2_y0[src_idx];
         WORD16 a1 = pi2_y1[src_idx];
         WORD16 a2 = pi2_y2[src_idx];
         WORD16 a3 = pi2_y3[src_idx];

         WORD16 b0 = (a0 + a1) >> 1;
         WORD16 b1 = (a0 - a1) >> 1;
         WORD16 b2 = (a2 + a3) >> 1;
         WORD16 b3 = (a2 - a3) >> 1;

         pi2_dst0[dst_idx] = b0 + b2;
         pi2_dst1[dst_idx] = b1 + b3;
         pi2_dst2[dst_idx] = b0 - b2;
         pi2_dst3[dst_idx] = b1 - b3;

         /* Make the value of dst to zerp, if it falls below the dead-zone */
         if(ABS(pi2_dst0[dst_idx]) > threshold)
             *pi4_cbf = 1;
         if(ABS(pi2_dst1[dst_idx]) > threshold)
             *pi4_cbf = 1;
         if(ABS(pi2_dst2[dst_idx]) > threshold)
             *pi4_cbf = 1;
         if(ABS(pi2_dst3[dst_idx]) > threshold)
             *pi4_cbf = 1;

         u4_satd += ABS(pi2_dst0[dst_idx]);
         u4_satd += ABS(pi2_dst1[dst_idx]);
         u4_satd += ABS(pi2_dst2[dst_idx]);
         u4_satd += ABS(pi2_dst3[dst_idx]);
     }

     /* return 16x16 satd */
     return (u4_satd);
 }

 /**
 *******************************************************************************
 *
 * @brief
 *    Hadamard Transform for 16x16 block with 8x8 and 4x4 SATD updates.
 *    Uses recursive 8x8 had output to compute satd for 16x16 and its children
 *
 * @par Description:
 *
 * @param[in] pu1_origin
 *  UWORD8 pointer to the current block
 *
 * @param[in] src_strd
 *  WORD32 Source stride
 *
 * @param[in] pu1_pred
 *  UWORD8 pointer to the prediction block
 *
 * @param[in] pred_strd
 *  WORD32 Pred stride
 *
 * @param[out] pi2_dst
 *  WORD16 pointer to the transform output block
 *
 * @param[out] dst_strd
 *  WORD32 Destination stride
 *
 * @param[out] ppi4_hsad
 *   pointer to base pointers for storing hadmard sads of various
 *   block sizes (4x4 to 32x32)
 *
 * @param[in] pos_x_y_4x4
 *   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
 *   Lower 16bits denote xpos and upper 16ypos of the 4x4block
 *
 * @param[in] num_4x4_in_row
 *   Denotes the number of current 4x4 blocks in a ctb/CU/MB
 *
 * @param[in] lambda
 *  lambda values is the cost factor calculated based on QP
 *
 * @param[in] lambda_q_shift
 *  lambda_q_shift used to reverse the lambda value back from q8 format
 *
 * @param[in] depth
 *  depth gives the current TU depth with respect to the CU
 *
 * @param[in] i4_frm_qstep
 *  frm_qstep value based on the which the threshold value is calculated
 *
 * @returns
 *
 * @remarks
 *
 *******************************************************************************
 */

 WORD32 ihevce_had_16x16_r(
     UWORD8 *pu1_src,
     WORD32 src_strd,
     UWORD8 *pu1_pred,
     WORD32 pred_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd,
     WORD32 **ppi4_hsad,
     WORD32 **ppi4_tu_split,
     WORD32 **ppi4_tu_early_cbf,
     WORD32 pos_x_y_4x4,
     WORD32 num_4x4_in_row,
     WORD32 lambda,
     WORD32 lambda_q_shift,
     WORD32 i4_frm_qstep,
     WORD32 i4_cur_depth,
     WORD32 i4_max_depth,
     WORD32 i4_max_tr_size,
     WORD32 *pi4_tu_split_cost,
     void *pv_func_sel)
 {
     WORD16 ai2_8x8_had[256];
     WORD32 *pi4_16x16_hsad;
     WORD32 *pi4_16x16_tu_split;

     WORD32 *pi4_16x16_tu_early_cbf;

     UWORD32 u4_satd = 0;
     WORD32 tu_split_flag = 0;
     WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
     const UWORD8 u1_cur_tr_size = 16;

     /* cost_parent : Stores the cost of the parent HAD transform (16x16) */
     /* cost_child : Stores the cost of the child HAD transform (16x16) */
     WORD32 cost_parent = 0, cost_child = 0;

     /*best_cost returns the best cost at the end of the function */
     /*tu_split denoes whether the TU (16x16)is split or not */
     WORD32 best_cost = 0, best_cost_tu_split;
     WORD32 i;

     WORD16 *pi2_y0;
     UWORD8 *pu1_src0;
     UWORD8 *pu1_pred0;
     WORD32 pos_x_y_4x4_0;

     WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
     WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;

     ASSERT(pos_x >= 0);
     ASSERT(pos_y >= 0);

     /* Initialize pointers to  store 16x16 SATDs */
     pi4_16x16_hsad = ppi4_hsad[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);

     pi4_16x16_tu_split =
         ppi4_tu_split[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);

     pi4_16x16_tu_early_cbf =
         ppi4_tu_early_cbf[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);

     /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
     for(i = 0; i < 4; i++)
     {
         pu1_src0 = pu1_src + (i & 0x01) * 8 + (i >> 1) * src_strd * 8;
         pu1_pred0 = pu1_pred + (i & 0x01) * 8 + (i >> 1) * pred_strd * 8;
         pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
         pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);

         best_cost_tu_split = ihevce_had_8x8_using_4_4x4_r(
             pu1_src0,
             src_strd,
             pu1_pred0,
             pred_strd,
             pi2_y0,
             16,
             ppi4_hsad,
             ppi4_tu_split,
             ppi4_tu_early_cbf,
             pos_x_y_4x4_0,
             num_4x4_in_row,
             lambda,
             lambda_q_shift,
             i4_frm_qstep,
             i4_cur_depth + 1,
             i4_max_depth,
             i4_max_tr_size,
             pi4_tu_split_cost,
             pv_func_sel);

         /* Cost is shifted by two bits for Tu_split_flag and early cbf flag */
         best_cost = (best_cost_tu_split >> 2);

         /* Last but one bit stores the information regarding the TU_Split */
         tu_split_flag += (best_cost_tu_split & 0x3) >> 1;

         /* Last bit stores the information regarding the early_cbf */
         i4_early_cbf_flag += (best_cost_tu_split & 0x1);

         cost_child += best_cost;

         tu_split_flag <<= 1;
         i4_early_cbf_flag <<= 1;
     }

     /* -------- Compute 16x16 HAD Transform using 8x8 results ------------- */
     pi2_y0 = ai2_8x8_had;

     /* Threshold currently passed as "0" */
     u4_satd =
         ihevce_compute_16x16HAD_using_8x8(pi2_y0, 16, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);

     /* store the normalized satd */
     cost_parent = ((u4_satd + 4) >> 3);

     /* 4 TU_Split flags , 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
     cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);

     i4_early_cbf_flag += early_cbf;

     /* Right now the depth is hard-coded to 4: The depth can be modified from the config file
     which decides the extent to which TU_REC needs to be done */
     if(i4_cur_depth < i4_max_depth)
     {
         if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
         {
             //cost_child -= ((4 + 4)  * lambda) >> (lambda_q_shift + 1);
             *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
             tu_split_flag += 1;
             best_cost = cost_child;
         }
         else
         {
             //cost_parent -= ((1 + 1) * lambda) >>  (lambda_q_shift + 1);
             tu_split_flag += 0;
             best_cost = cost_parent;
         }
     }
     else
     {
         //cost_parent -= ((1 + 1) * lambda) >>  (lambda_q_shift + 1);
         tu_split_flag += 0;
         best_cost = cost_parent;
     }

     pi4_16x16_hsad[0] = best_cost;
     pi4_16x16_tu_split[0] = tu_split_flag;
     pi4_16x16_tu_early_cbf[0] = i4_early_cbf_flag;

     /*returning two values(best cost & tu_split_flag) as a single value*/
     return ((best_cost << 10) + (tu_split_flag << 5) + i4_early_cbf_flag);
 }

 //#endif
 /**
 *******************************************************************************
 *
 * @brief
 *   Computes 32x32 transform using children 16x16 hadamard results
 *
 * @par Description:
 *
 * @param[in] pi2_16x16_had
 *  WORD16 pointer to 16x16 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
 *
 * @param[in] had16_strd
 *  stride of 16x16 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
 *
 * @param[out] pi2_dst
 *  destination buffer where 16x16 hadamard result is stored
 *
 * @param[in] dst_stride
 *  stride of destination block
 *
 * @param[in] i4_frm_qstep
 *  frm_qstep value based on the which the threshold value is calculated
 *
 * @returns
 *  32x32 Hadamard SATD
 * @remarks
 *
 *******************************************************************************
 */
 //#if COMPUTE_32x32_USING_16X16 == C
 UWORD32 ihevce_compute_32x32HAD_using_16x16(
     WORD16 *pi2_16x16_had,
     WORD32 had16_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd,
     WORD32 i4_frm_qstep,
     WORD32 *pi4_cbf)
 {
     /* Qstep value is right shifted by 8 */
     WORD32 threshold = (i4_frm_qstep >> 8);

     /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */
     WORD16 *pi2_y0 = pi2_16x16_had;
     WORD16 *pi2_y1 = pi2_16x16_had + 16;
     WORD16 *pi2_y2 = pi2_16x16_had + had16_strd * 16;
     WORD16 *pi2_y3 = pi2_16x16_had + had16_strd * 16 + 16;

     /* Initialize pointers to store 8x8 HAD output */
     WORD16 *pi2_dst0 = pi2_dst;
     WORD16 *pi2_dst1 = pi2_dst + 16;
     WORD16 *pi2_dst2 = pi2_dst + dst_strd * 16;
     WORD16 *pi2_dst3 = pi2_dst + dst_strd * 16 + 16;

     UWORD32 u4_satd = 0;
     WORD32 i;

     /*   Child HAD results combined as follows to get Parent result */
     /*  _                                                 _         */
     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
     /* \-                                                 -/        */
     for(i = 0; i < 256; i++)
     {
         WORD32 src_idx = (i >> 4) * had16_strd + (i % 16);
         WORD32 dst_idx = (i >> 4) * dst_strd + (i % 16);

         WORD16 a0 = pi2_y0[src_idx] >> 2;
         WORD16 a1 = pi2_y1[src_idx] >> 2;
         WORD16 a2 = pi2_y2[src_idx] >> 2;
         WORD16 a3 = pi2_y3[src_idx] >> 2;

         WORD16 b0 = (a0 + a1);
         WORD16 b1 = (a0 - a1);
         WORD16 b2 = (a2 + a3);
         WORD16 b3 = (a2 - a3);

         pi2_dst0[dst_idx] = b0 + b2;
         pi2_dst1[dst_idx] = b1 + b3;
         pi2_dst2[dst_idx] = b0 - b2;
         pi2_dst3[dst_idx] = b1 - b3;

         /* Make the value of dst to zerp, if it falls below the dead-zone */
         if(ABS(pi2_dst0[dst_idx]) > threshold)
             *pi4_cbf = 1;
         if(ABS(pi2_dst1[dst_idx]) > threshold)
             *pi4_cbf = 1;
         if(ABS(pi2_dst2[dst_idx]) > threshold)
             *pi4_cbf = 1;
         if(ABS(pi2_dst3[dst_idx]) > threshold)
             *pi4_cbf = 1;

         u4_satd += ABS(pi2_dst0[dst_idx]);
         u4_satd += ABS(pi2_dst1[dst_idx]);
         u4_satd += ABS(pi2_dst2[dst_idx]);
         u4_satd += ABS(pi2_dst3[dst_idx]);
     }

     /* return 32x32 satd */
     return (u4_satd);
 }
 //#endif

 /**
 *******************************************************************************
 *
 * @brief
 *    Hadamard Transform for 32x32 block with 16x6, 8x8 and 4x4 SATD updates.
 *    Uses recursive 16x16 had output to compute satd for 32x32 and its children
 *
 * @par Description:
 *
 * @param[in] pu1_origin
 *  UWORD8 pointer to the current block
 *
 * @param[in] src_strd
 *  WORD32 Source stride
 *
 * @param[in] pu1_pred
 *  UWORD8 pointer to the prediction block
 *
 * @param[in] pred_strd
 *  WORD32 Pred stride
 *
 * @param[out] pi2_dst
 *  WORD16 pointer to the transform output block
 *
 * @param[out] dst_strd
 *  WORD32 Destination stride
 *
 * @param[out] ppi4_hsad
 *   pointer to base pointers for storing hadmard sads of various
 *   block sizes (4x4 to 32x32)
 *
 * @param[in] pos_x_y_4x4
 *   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
 *   Lower 16bits denote xpos and upper 16ypos of the 4x4block
 *
 * @param[in] num_4x4_in_row
 *   Denotes the number of current 4x4 blocks in a ctb/CU/MB
 *
 * @param[in] lambda
 *  lambda values is the cost factor calculated based on QP
 *
 * @param[in] lambda_q_shift
 *  lambda_q_shift used to reverse the lambda value back from q8 format
 *
 * @param[in] depth
 *  depth gives the current TU depth with respect to the CU
 *
 * @param[in] i4_frm_qstep
 *  frm_qstep value based on the which the threshold value is calculated
 *
 *
 * @returns
 *
 * @remarks
 *
 *******************************************************************************
 */
 void ihevce_had_32x32_r(
     UWORD8 *pu1_src,
     WORD32 src_strd,
     UWORD8 *pu1_pred,
     WORD32 pred_strd,
     WORD16 *pi2_dst,
     WORD32 dst_strd,
     WORD32 **ppi4_hsad,
     WORD32 **ppi4_tu_split,
     WORD32 **ppi4_tu_early_cbf,
     WORD32 pos_x_y_4x4,
     WORD32 num_4x4_in_row,
     WORD32 lambda,
     WORD32 lambda_q_shift,
     WORD32 i4_frm_qstep,
     WORD32 i4_cur_depth,
     WORD32 i4_max_depth,
     WORD32 i4_max_tr_size,
     WORD32 *pi4_tu_split_cost,
     me_func_selector_t *ps_func_selector)

 {
     WORD16 ai2_16x16_had[1024];
     WORD32 *pi4_32x32_hsad;
     WORD32 *pi4_32x32_tu_split;
     WORD32 *pi4_32x32_tu_early_cbf;

     WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
     WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
     WORD32 tu_split_flag = 0;
     const UWORD8 u1_cur_tr_size = 32;
     WORD32 i4_early_cbf_flag = 0, early_cbf = 0;

     /* cost_parent : Stores the cost of the parent HAD transform (16x16) */
     /* cost_child : Stores the cost of the child HAD transform (16x16) */
     WORD32 cost_child = 0, cost_parent = 0;

     /*retuned as the best cost for the entire TU (32x32) */
     WORD32 best_cost = 0;
     /*captures the best cost and tu_split at child level */
     WORD32 best_cost_tu_split;

     /* Initialize pointers to 4 8x8 blocks in 16x16 */
     WORD16 *pi2_y0 = ai2_16x16_had;
     WORD16 *pi2_y1 = ai2_16x16_had + 16;
     WORD16 *pi2_y2 = ai2_16x16_had + 32 * 16;
     WORD16 *pi2_y3 = ai2_16x16_had + 32 * 16 + 16;

     UWORD8 *pu1_src0 = pu1_src;
     UWORD8 *pu1_src1 = pu1_src + 16;
     UWORD8 *pu1_src2 = pu1_src + src_strd * 16;
     UWORD8 *pu1_src3 = pu1_src + src_strd * 16 + 16;

     UWORD8 *pu1_pred0 = pu1_pred;
     UWORD8 *pu1_pred1 = pu1_pred + 16;
     UWORD8 *pu1_pred2 = pu1_pred + pred_strd * 16;
     UWORD8 *pu1_pred3 = pu1_pred + pred_strd * 16 + 16;

     ASSERT(pos_x >= 0);
     ASSERT(pos_y >= 0);

     /* Initialize pointers to store 32x32 SATDs */
     pi4_32x32_hsad = ppi4_hsad[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);

     pi4_32x32_tu_split =
         ppi4_tu_split[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);

     pi4_32x32_tu_early_cbf =
         ppi4_tu_early_cbf[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);

     /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
     best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
         pu1_src0,
         src_strd,
         pu1_pred0,
         pred_strd,
         pi2_y0,
         32,
         ppi4_hsad,
         ppi4_tu_split,
         ppi4_tu_early_cbf,
         pos_x_y_4x4,
         num_4x4_in_row,
         lambda,
         lambda_q_shift,
         i4_frm_qstep,
         i4_cur_depth + 1,
         i4_max_depth,
         i4_max_tr_size,
         pi4_tu_split_cost,
         NULL);

     /* cost is shifted by 10bits */
     best_cost = best_cost_tu_split >> 10;

     /* Tu split is present in the 6-10 bits */
     tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;

     /*Early CBF info is present in the last 5 bits */
     i4_early_cbf_flag += best_cost_tu_split & 0x1F;

     tu_split_flag <<= 5;
     i4_early_cbf_flag <<= 5;

     cost_child += best_cost;

     best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
         pu1_src1,
         src_strd,
         pu1_pred1,
         pred_strd,
         pi2_y1,
         32,
         ppi4_hsad,
         ppi4_tu_split,
         ppi4_tu_early_cbf,
         pos_x_y_4x4 + 4,
         num_4x4_in_row,
         lambda,
         lambda_q_shift,
         i4_frm_qstep,
         i4_cur_depth + 1,
         i4_max_depth,
         i4_max_tr_size,
         pi4_tu_split_cost,
         NULL);

     /* cost is shifted by 10bits */
     best_cost = best_cost_tu_split >> 10;

     /* Tu split is present in the 6-10 bits */
     tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;

     /*Early CBF info is present in the last 5 bits */
     i4_early_cbf_flag += best_cost_tu_split & 0x1F;

     tu_split_flag <<= 5;
     i4_early_cbf_flag <<= 5;

     cost_child += best_cost;

     best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
         pu1_src2,
         src_strd,
         pu1_pred2,
         pred_strd,
         pi2_y2,
         32,
         ppi4_hsad,
         ppi4_tu_split,
         ppi4_tu_early_cbf,
         pos_x_y_4x4 + (4 << 16),
         num_4x4_in_row,
         lambda,
         lambda_q_shift,
         i4_frm_qstep,
         i4_cur_depth + 1,
         i4_max_depth,
         i4_max_tr_size,
         pi4_tu_split_cost,
         NULL);

     /* cost is shifted by 10bits */
     best_cost = best_cost_tu_split >> 10;

     /* Tu split is present in the 6-10 bits */
     tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;

     /*Early CBF info is present in the last 5 bits */
     i4_early_cbf_flag += best_cost_tu_split & 0x1F;

     tu_split_flag <<= 5;
     i4_early_cbf_flag <<= 5;

     cost_child += best_cost;

     best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
         pu1_src3,
         src_strd,
         pu1_pred3,
         pred_strd,
         pi2_y3,
         32,
         ppi4_hsad,
         ppi4_tu_split,
         ppi4_tu_early_cbf,
         pos_x_y_4x4 + (4 << 16) + 4,
         num_4x4_in_row,
         lambda,
         lambda_q_shift,
         i4_frm_qstep,
         i4_cur_depth + 1,
         i4_max_depth,
         i4_max_tr_size,
         pi4_tu_split_cost,
         NULL);

     /* cost is shifted by 10bits */
     best_cost = best_cost_tu_split >> 10;

     /* Tu split is present in the 6-10 bits */
     tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;

     /*Early CBF info is present in the last 5 bits */
     i4_early_cbf_flag += best_cost_tu_split & 0x1F;

     tu_split_flag <<= 1;
     i4_early_cbf_flag <<= 1;

     cost_child += best_cost;

     {
         UWORD32 u4_satd = 0;

         u4_satd = ps_func_selector->pf_compute_32x32HAD_using_16x16(
             pi2_y0, 32, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);

         cost_parent = ((u4_satd + 2) >> 2);
     }

     /* 4 TU_Split flags , 4 CBF Flags*/
     cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);

     i4_early_cbf_flag += early_cbf;

     /* 1 TU_SPlit flag, 1 CBF flag */
     //cost_parent += ((1 + 1)* lambda) >>  (lambda_q_shift + 1);

     if(i4_cur_depth < i4_max_depth)
     {
         if((cost_child < cost_parent) || (u1_cur_tr_size > i4_max_tr_size))
         {
             *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
             best_cost = cost_child;
             tu_split_flag++;
         }
         else
         {
             tu_split_flag = 0;
             best_cost = cost_parent;
         }
     }
     else
     {
         tu_split_flag = 0;
         best_cost = cost_parent;
     }

     pi4_32x32_tu_split[0] = tu_split_flag;

     pi4_32x32_hsad[0] = best_cost;

     pi4_32x32_tu_early_cbf[0] = i4_early_cbf_flag;
 }