media/libstagefright/codecs/m4v_h263/dec/src/idct.cpp - platform/frameworks/av - Git at Google

 /* ------------------------------------------------------------------
  * Copyright (C) 1998-2009 PacketVideo
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
  * express or implied.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  * -------------------------------------------------------------------
  */
 /*
 ------------------------------------------------------------------------------
  MODULE DESCRIPTION

  This file contains the functions that transform an 8r8 image block from
  dequantized DCT coefficients to spatial domain pirel values by calculating
  inverse discrete cosine transform (IDCT).

 ------------------------------------------------------------------------------
 */
 /*----------------------------------------------------------------------------
 ; INCLUDES
 ----------------------------------------------------------------------------*/
 #include "mp4dec_lib.h"
 #include "idct.h"
 #include "motion_comp.h"
 #ifndef FAST_IDCT

 /*
 ------------------------------------------------------------------------------
  FUNCTION NAME: idct
 ------------------------------------------------------------------------------
  INPUT AND OUTPUT DEFINITIONS FOR idct

  Inputs:
     blk = pointer to the buffer containing the dequantized DCT
           coefficients of type int for an 8r8 image block;
           values range from (-2048, 2047) which defined as standard.

  Local Stores/Buffers/Pointers Needed:
     None

  Global Stores/Buffers/Pointers Needed:
     None

  Outputs:
     None

  Pointers and Buffers Modified:
     blk points to the found IDCT values for an 8r8 image block.

  Local Stores Modified:
     None

  Global Stores Modified:
     None

 ------------------------------------------------------------------------------
  FUNCTION DESCRIPTION FOR idct

  This function transforms an 8r8 image block from dequantized DCT coefficients
  (F(u,v)) to spatial domain pirel values (f(r,y)) by performing the two
  dimensional inverse discrete cosine transform (IDCT).

          _7_ _7_      C(u) C(v)
     f(r,y) = \   \  F(u,v)---- ----cos[(2r+1)*u*pi/16]cos[(2y+1)*v*pi/16]
          /__ /__    2    2
          u=0 v=0

     where   C(i) = 1/sqrt(2)    if i=0
         C(i) = 1        otherwise

  2-D IDCT can be separated as horizontal(row-wise) and vertical(column-wise)
  1-D IDCTs. Therefore, 2-D IDCT values are found by the following two steps:
  1. Find horizontal 1-D IDCT values for each row from 8r8 dequantized DCT
     coefficients by row IDCT operation.

           _7_        C(u)
     g(r,v) =  \   F(u,v) ---- cos[(2r+1)*u*pi/16]
           /__         2
           u=0

  2. Find vertical 1-D IDCT values for each column from the results of 1
     by column IDCT operation.

               _7_        C(v)
     f(r,y) =  \   g(r,v) ---- cos[(2y+1)*v*pi/16]
           /__         2
           v=0

 ------------------------------------------------------------------------------
  REQUIREMENTS FOR idct

  None

 ------------------------------------------------------------------------------
 */
 /*  REFERENCES FOR idct */
 /* idct.c, inverse fast discrete cosine transform
  inverse two dimensional DCT, Chen-Wang algorithm
  (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984)
  32-bit integer arithmetic (8 bit coefficients)
  11 mults, 29 adds per DCT
  sE, 18.8.91

  coefficients ertended to 12 bit for IEEE1180-1990
  compliance                           sE,  2.1.94
 */


 /*----------------------------------------------------------------------------
 ; Function Code FOR idct
 ----------------------------------------------------------------------------*/
 void idct_intra(
     int *blk, uint8 *comp, int width
 )
 {
     /*----------------------------------------------------------------------------
     ; Define all local variables
     ----------------------------------------------------------------------------*/
     int i;
     int32   tmpBLK[64];
     int32   *tmpBLK32 = &tmpBLK[0];
     int32   r0, r1, r2, r3, r4, r5, r6, r7, r8; /* butterfly nodes */
     int32   a;
     int offset = width - 8;
     /*----------------------------------------------------------------------------
     ; Function body here
     ----------------------------------------------------------------------------*/
     /* two dimensional inverse discrete cosine transform */


     /* column (vertical) IDCT */
     for (i = B_SIZE - 1; i >= 0; i--)
     {
         /* initialize butterfly nodes at first stage */

         r1 = blk[B_SIZE * 4 + i] << 11;
         /* since row IDCT results have net left shift by 3 */
         /* this left shift by 8 gives net left shift by 11 */
         /* in order to maintain the same scale as that of  */
         /* coefficients Wi */

         r2 = blk[B_SIZE * 6 + i];
         r3 = blk[B_SIZE * 2 + i];
         r4 = blk[B_SIZE * 1 + i];
         r5 = blk[B_SIZE * 7 + i];
         r6 = blk[B_SIZE * 5 + i];
         r7 = blk[B_SIZE * 3 + i];

         if (!(r1 | r2 | r3 | r4 | r5 | r6 | r7))
         {
             /* shortcut */
             /* execute if values of g(r,1) to g(r,7) in a column*/
             /* are all zeros */

             /* make output of IDCT >>3 or scaled by 1/8 and */
             /* with the proper rounding */
             a = (blk[B_SIZE * 0 + i]) << 3;
             tmpBLK32[B_SIZE * 0 + i] = a;
             tmpBLK32[B_SIZE * 1 + i] = a;
             tmpBLK32[B_SIZE * 2 + i] = a;
             tmpBLK32[B_SIZE * 3 + i] = a;
             tmpBLK32[B_SIZE * 4 + i] = a;
             tmpBLK32[B_SIZE * 5 + i] = a;
             tmpBLK32[B_SIZE * 6 + i] = a;
             tmpBLK32[B_SIZE * 7 + i] = a;
         }
         else
         {
             r0 = (blk[8 * 0 + i] << 11) + 128;

             /* first stage */

             r8 = W7 * (r4 + r5);
             r4 = (r8 + (W1 - W7) * r4);
             /* Multiplication with Wi increases the net left */
             /* shift from 11 to 14,we have to shift back by 3*/
             r5 = (r8 - (W1 + W7) * r5);
             r8 = W3 * (r6 + r7);
             r6 = (r8 - (W3 - W5) * r6);
             r7 = (r8 - (W3 + W5) * r7);

             /* second stage */
             r8 = r0 + r1;
             r0 -= r1;

             r1 = W6 * (r3 + r2);
             r2 = (r1 - (W2 + W6) * r2);
             r3 = (r1 + (W2 - W6) * r3);

             r1 = r4 + r6;
             r4 -= r6;
             r6 = r5 + r7;
             r5 -= r7;

             /* third stage */
             r7 = r8 + r3;
             r8 -= r3;
             r3 = r0 + r2;
             r0 -= r2;
             r2 = (181 * (r4 + r5) + 128) >> 8;  /* rounding */
             r4 = (181 * (r4 - r5) + 128) >> 8;

             /* fourth stage */
             /* net shift of IDCT is >>3 after the following */
             /* shift operation, it makes output of 2-D IDCT */
             /* scaled by 1/8, that is scaled twice by       */
             /* 1/(2*sqrt(2)) for row IDCT and column IDCT.  */
             /* see detail analysis in design doc.           */
             tmpBLK32[0 + i] = (r7 + r1) >> 8;
             tmpBLK32[(1<<3) + i] = (r3 + r2) >> 8;
             tmpBLK32[(2<<3) + i] = (r0 + r4) >> 8;
             tmpBLK32[(3<<3) + i] = (r8 + r6) >> 8;
             tmpBLK32[(4<<3) + i] = (r8 - r6) >> 8;
             tmpBLK32[(5<<3) + i] = (r0 - r4) >> 8;
             tmpBLK32[(6<<3) + i] = (r3 - r2) >> 8;
             tmpBLK32[(7<<3) + i] = (r7 - r1) >> 8;
         }
     }
     /* row (horizontal) IDCT */
     for (i = 0 ; i < B_SIZE; i++)
     {
         /* initialize butterfly nodes at the first stage */

         r1 = ((int32)tmpBLK32[4+(i<<3)]) << 8;
         /* r1 left shift by 11 is to maintain the same  */
         /* scale as that of coefficients (W1,...W7) */
         /* since blk[4] won't multiply with Wi.     */
         /* see detail diagram in design document.   */

         r2 = tmpBLK32[6+(i<<3)];
         r3 = tmpBLK32[2+(i<<3)];
         r4 = tmpBLK32[1+(i<<3)];
         r5 = tmpBLK32[7+(i<<3)];
         r6 = tmpBLK32[5+(i<<3)];
         r7 = tmpBLK32[3+(i<<3)];

         if (!(r1 | r2 | r3 | r4 | r5 | r6 | r7))
         {
             /* shortcut */
             /* execute if values of F(1,v) to F(7,v) in a row*/
             /* are all zeros */

             /* output of row IDCT scaled by 8 */
             a = (((int32)tmpBLK32[0+(i<<3)] + 32) >> 6);
             CLIP_RESULT(a)
             *comp++ = a;
             *comp++ = a;
             *comp++ = a;
             *comp++ = a;
             *comp++ = a;
             *comp++ = a;
             *comp++ = a;
             *comp++ = a;

             comp += offset;
         }

         else
         {
             /* for proper rounding in the fourth stage */
             r0 = (((int32)tmpBLK32[0+(i<<3)]) << 8) + 8192;

             /* first stage */

             r8 = W7 * (r4 + r5) + 4;
             r4 = (r8 + (W1 - W7) * r4) >> 3;
             r5 = (r8 - (W1 + W7) * r5) >> 3;

             r8 = W3 * (r6 + r7) + 4;
             r6 = (r8 - (W3 - W5) * r6) >> 3;
             r7 = (r8 - (W3 + W5) * r7) >> 3;

             /* second stage */
             r8 = r0 + r1;
             r0 -= r1;

             r1 = W6 * (r3 + r2) + 4;
             r2 = (r1 - (W2 + W6) * r2) >> 3;
             r3 = (r1 + (W2 - W6) * r3) >> 3;

             r1 = r4 + r6;
             r4 -= r6;
             r6 = r5 + r7;
             r5 -= r7;

             /* third stage */
             r7 = r8 + r3;
             r8 -= r3;
             r3 = r0 + r2;
             r0 -= r2;
             r2 = (181 * (r4 + r5) + 128) >> 8;    /* rounding */
             r4 = (181 * (r4 - r5) + 128) >> 8;

             /* fourth stage */
             /* net shift of this function is <<3 after the    */
             /* following shift operation, it makes output of  */
             /* row IDCT scaled by 8 to retain 3 bits precision*/
             a = ((r7 + r1) >> 14);
             CLIP_RESULT(a)
             *comp++ = a;
             a = ((r3 + r2) >> 14);
             CLIP_RESULT(a)
             *comp++ = a;
             a = ((r0 + r4) >> 14);
             CLIP_RESULT(a)
             *comp++ = a;
             a = ((r8 + r6) >> 14);
             CLIP_RESULT(a)
             *comp++ = a;
             a = ((r8 - r6) >> 14);
             CLIP_RESULT(a)
             *comp++ = a;
             a = ((r0 - r4) >> 14);
             CLIP_RESULT(a)
             *comp++ = a;
             a = ((r3 - r2) >> 14);
             CLIP_RESULT(a)
             *comp++ = a;
             a = ((r7 - r1) >> 14);
             CLIP_RESULT(a)
             *comp++ = a;

             comp += offset;
         }
     }


     /*----------------------------------------------------------------------------
     ; Return nothing or data or data pointer
     ----------------------------------------------------------------------------*/
     return;
 }

 void idct(
     int *blk, uint8 *pred, uint8 *dst, int width)
 {
     /*----------------------------------------------------------------------------
     ; Define all local variables
     ----------------------------------------------------------------------------*/
     int i;
     int32   tmpBLK[64];
     int32   *tmpBLK32 = &tmpBLK[0];
     int32   r0, r1, r2, r3, r4, r5, r6, r7, r8; /* butterfly nodes */
     int32   a;
     int res;

     /*----------------------------------------------------------------------------
     ; Function body here
     ----------------------------------------------------------------------------*/
     /* two dimensional inverse discrete cosine transform */


     /* column (vertical) IDCT */
     for (i = B_SIZE - 1; i >= 0; i--)
     {
         /* initialize butterfly nodes at first stage */

         r1 = blk[B_SIZE * 4 + i] << 11;
         /* since row IDCT results have net left shift by 3 */
         /* this left shift by 8 gives net left shift by 11 */
         /* in order to maintain the same scale as that of  */
         /* coefficients Wi */

         r2 = blk[B_SIZE * 6 + i];
         r3 = blk[B_SIZE * 2 + i];
         r4 = blk[B_SIZE * 1 + i];
         r5 = blk[B_SIZE * 7 + i];
         r6 = blk[B_SIZE * 5 + i];
         r7 = blk[B_SIZE * 3 + i];

         if (!(r1 | r2 | r3 | r4 | r5 | r6 | r7))
         {
             /* shortcut */
             /* execute if values of g(r,1) to g(r,7) in a column*/
             /* are all zeros */

             /* make output of IDCT >>3 or scaled by 1/8 and */
             /* with the proper rounding */
             a = (blk[B_SIZE * 0 + i]) << 3;
             tmpBLK32[B_SIZE * 0 + i] = a;
             tmpBLK32[B_SIZE * 1 + i] = a;
             tmpBLK32[B_SIZE * 2 + i] = a;
             tmpBLK32[B_SIZE * 3 + i] = a;
             tmpBLK32[B_SIZE * 4 + i] = a;
             tmpBLK32[B_SIZE * 5 + i] = a;
             tmpBLK32[B_SIZE * 6 + i] = a;
             tmpBLK32[B_SIZE * 7 + i] = a;
         }
         else
         {
             r0 = (blk[8 * 0 + i] << 11) + 128;

             /* first stage */

             r8 = W7 * (r4 + r5);
             r4 = (r8 + (W1 - W7) * r4);
             /* Multiplication with Wi increases the net left */
             /* shift from 11 to 14,we have to shift back by 3*/
             r5 = (r8 - (W1 + W7) * r5);
             r8 = W3 * (r6 + r7);
             r6 = (r8 - (W3 - W5) * r6);
             r7 = (r8 - (W3 + W5) * r7);

             /* second stage */
             r8 = r0 + r1;
             r0 -= r1;

             r1 = W6 * (r3 + r2);
             r2 = (r1 - (W2 + W6) * r2);
             r3 = (r1 + (W2 - W6) * r3);

             r1 = r4 + r6;
             r4 -= r6;
             r6 = r5 + r7;
             r5 -= r7;

             /* third stage */
             r7 = r8 + r3;
             r8 -= r3;
             r3 = r0 + r2;
             r0 -= r2;
             r2 = (181 * (r4 + r5) + 128) >> 8;  /* rounding */
             r4 = (181 * (r4 - r5) + 128) >> 8;

             /* fourth stage */
             /* net shift of IDCT is >>3 after the following */
             /* shift operation, it makes output of 2-D IDCT */
             /* scaled by 1/8, that is scaled twice by       */
             /* 1/(2*sqrt(2)) for row IDCT and column IDCT.  */
             /* see detail analysis in design doc.           */
             tmpBLK32[0 + i] = (r7 + r1) >> 8;
             tmpBLK32[(1<<3) + i] = (r3 + r2) >> 8;
             tmpBLK32[(2<<3) + i] = (r0 + r4) >> 8;
             tmpBLK32[(3<<3) + i] = (r8 + r6) >> 8;
             tmpBLK32[(4<<3) + i] = (r8 - r6) >> 8;
             tmpBLK32[(5<<3) + i] = (r0 - r4) >> 8;
             tmpBLK32[(6<<3) + i] = (r3 - r2) >> 8;
             tmpBLK32[(7<<3) + i] = (r7 - r1) >> 8;
         }
     }
     /* row (horizontal) IDCT */
     for (i = B_SIZE - 1; i >= 0; i--)
     {
         /* initialize butterfly nodes at the first stage */

         r1 = ((int32)tmpBLK32[4+(i<<3)]) << 8;
         /* r1 left shift by 11 is to maintain the same  */
         /* scale as that of coefficients (W1,...W7) */
         /* since blk[4] won't multiply with Wi.     */
         /* see detail diagram in design document.   */

         r2 = tmpBLK32[6+(i<<3)];
         r3 = tmpBLK32[2+(i<<3)];
         r4 = tmpBLK32[1+(i<<3)];
         r5 = tmpBLK32[7+(i<<3)];
         r6 = tmpBLK32[5+(i<<3)];
         r7 = tmpBLK32[3+(i<<3)];

         if (!(r1 | r2 | r3 | r4 | r5 | r6 | r7))
         {
             /* shortcut */
             /* execute if values of F(1,v) to F(7,v) in a row*/
             /* are all zeros */

             /* output of row IDCT scaled by 8 */
             a = (tmpBLK32[0+(i<<3)] + 32) >> 6;
             blk[0+(i<<3)] = a;
             blk[1+(i<<3)] = a;
             blk[2+(i<<3)] = a;
             blk[3+(i<<3)] = a;
             blk[4+(i<<3)] = a;
             blk[5+(i<<3)] = a;
             blk[6+(i<<3)] = a;
             blk[7+(i<<3)] = a;

         }

         else
         {
             /* for proper rounding in the fourth stage */
             r0 = (((int32)tmpBLK32[0+(i<<3)]) << 8) + 8192;

             /* first stage */

             r8 = W7 * (r4 + r5) + 4;
             r4 = (r8 + (W1 - W7) * r4) >> 3;
             r5 = (r8 - (W1 + W7) * r5) >> 3;

             r8 = W3 * (r6 + r7) + 4;
             r6 = (r8 - (W3 - W5) * r6) >> 3;
             r7 = (r8 - (W3 + W5) * r7) >> 3;

             /* second stage */
             r8 = r0 + r1;
             r0 -= r1;

             r1 = W6 * (r3 + r2) + 4;
             r2 = (r1 - (W2 + W6) * r2) >> 3;
             r3 = (r1 + (W2 - W6) * r3) >> 3;

             r1 = r4 + r6;
             r4 -= r6;
             r6 = r5 + r7;
             r5 -= r7;

             /* third stage */
             r7 = r8 + r3;
             r8 -= r3;
             r3 = r0 + r2;
             r0 -= r2;
             r2 = (181 * (r4 + r5) + 128) >> 8;    /* rounding */
             r4 = (181 * (r4 - r5) + 128) >> 8;

             /* fourth stage */
             /* net shift of this function is <<3 after the    */
             /* following shift operation, it makes output of  */
             /* row IDCT scaled by 8 to retain 3 bits precision*/
             blk[0+(i<<3)] = (r7 + r1) >> 14;
             blk[1+(i<<3)] = (r3 + r2) >> 14;
             blk[2+(i<<3)] = (r0 + r4) >> 14;
             blk[3+(i<<3)] = (r8 + r6) >> 14;
             blk[4+(i<<3)] = (r8 - r6) >> 14;
             blk[5+(i<<3)] = (r0 - r4) >> 14;
             blk[6+(i<<3)] = (r3 - r2) >> 14;
             blk[7+(i<<3)] = (r7 - r1) >> 14;
         }
         /*  add with prediction ,  08/03/05 */
         res = (*pred++ + block[0+(i<<3)]);
         CLIP_RESULT(res);
         *dst++ = res;
         res = (*pred++ + block[1+(i<<3)]);
         CLIP_RESULT(res);
         *dst++ = res;
         res = (*pred++ + block[2+(i<<3)]);
         CLIP_RESULT(res);
         *dst++ = res;
         res = (*pred++ + block[3+(i<<3)]);
         CLIP_RESULT(res);
         *dst++ = res;
         res = (*pred++ + block[4+(i<<3)]);
         CLIP_RESULT(res);
         *dst++ = res;
         res = (*pred++ + block[5+(i<<3)]);
         CLIP_RESULT(res);
         *dst++ = res;
         res = (*pred++ + block[6+(i<<3)]);
         CLIP_RESULT(res);
         *dst++ = res;
         res = (*pred++ + block[7+(i<<3)]);
         CLIP_RESULT(res);
         *dst++ = res;

         pred += 8;
         dst += (width - 8);
     }


     /*----------------------------------------------------------------------------
     ; Return nothing or data or data pointer
     ----------------------------------------------------------------------------*/
     return;
 }

 #endif
 /*----------------------------------------------------------------------------
 ; End Function: idct
 ----------------------------------------------------------------------------*/
	/* ------------------------------------------------------------------
	* Copyright (C) 1998-2009 PacketVideo
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
	* express or implied.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	* -------------------------------------------------------------------
	*/
	/*
	------------------------------------------------------------------------------
	MODULE DESCRIPTION

	This file contains the functions that transform an 8r8 image block from
	dequantized DCT coefficients to spatial domain pirel values by calculating
	inverse discrete cosine transform (IDCT).

	------------------------------------------------------------------------------
	*/
	/*----------------------------------------------------------------------------
	; INCLUDES
	----------------------------------------------------------------------------*/
	#include "mp4dec_lib.h"
	#include "idct.h"
	#include "motion_comp.h"
	#ifndef FAST_IDCT

	/*
	------------------------------------------------------------------------------
	FUNCTION NAME: idct
	------------------------------------------------------------------------------
	INPUT AND OUTPUT DEFINITIONS FOR idct

	Inputs:
	blk = pointer to the buffer containing the dequantized DCT
	coefficients of type int for an 8r8 image block;
	values range from (-2048, 2047) which defined as standard.

	Local Stores/Buffers/Pointers Needed:
	None

	Global Stores/Buffers/Pointers Needed:
	None

	Outputs:
	None

	Pointers and Buffers Modified:
	blk points to the found IDCT values for an 8r8 image block.

	Local Stores Modified:
	None

	Global Stores Modified:
	None

	------------------------------------------------------------------------------
	FUNCTION DESCRIPTION FOR idct

	This function transforms an 8r8 image block from dequantized DCT coefficients
	(F(u,v)) to spatial domain pirel values (f(r,y)) by performing the two
	dimensional inverse discrete cosine transform (IDCT).

	_7_ _7_ C(u) C(v)
	f(r,y) = \ \ F(u,v)---- ----cos[(2r+1)upi/16]cos[(2y+1)vpi/16]
	/__ /__ 2 2
	u=0 v=0

	where C(i) = 1/sqrt(2) if i=0
	C(i) = 1 otherwise

	2-D IDCT can be separated as horizontal(row-wise) and vertical(column-wise)
	1-D IDCTs. Therefore, 2-D IDCT values are found by the following two steps:
	1. Find horizontal 1-D IDCT values for each row from 8r8 dequantized DCT
	coefficients by row IDCT operation.

	_7_ C(u)
	g(r,v) = \ F(u,v) ---- cos[(2r+1)upi/16]
	/__ 2
	u=0

	2. Find vertical 1-D IDCT values for each column from the results of 1
	by column IDCT operation.

	_7_ C(v)
	f(r,y) = \ g(r,v) ---- cos[(2y+1)vpi/16]
	/__ 2
	v=0

	------------------------------------------------------------------------------
	REQUIREMENTS FOR idct

	None

	------------------------------------------------------------------------------
	*/
	/* REFERENCES FOR idct */
	/* idct.c, inverse fast discrete cosine transform
	inverse two dimensional DCT, Chen-Wang algorithm
	(cf. IEEE ASSP-32, pp. 803-816, Aug. 1984)
	32-bit integer arithmetic (8 bit coefficients)
	11 mults, 29 adds per DCT
	sE, 18.8.91

	coefficients ertended to 12 bit for IEEE1180-1990
	compliance sE, 2.1.94
	*/


	/*----------------------------------------------------------------------------
	; Function Code FOR idct
	----------------------------------------------------------------------------*/
	void idct_intra(
	int blk, uint8 comp, int width
	)
	{
	/*----------------------------------------------------------------------------
	; Define all local variables
	----------------------------------------------------------------------------*/
	int i;
	int32 tmpBLK[64];
	int32 *tmpBLK32 = &tmpBLK[0];
	int32 r0, r1, r2, r3, r4, r5, r6, r7, r8; /* butterfly nodes */
	int32 a;
	int offset = width - 8;
	/*----------------------------------------------------------------------------
	; Function body here
	----------------------------------------------------------------------------*/
	/* two dimensional inverse discrete cosine transform */


	/* column (vertical) IDCT */
	for (i = B_SIZE - 1; i >= 0; i--)
	{
	/* initialize butterfly nodes at first stage */

	r1 = blk[B_SIZE * 4 + i] << 11;
	/* since row IDCT results have net left shift by 3 */
	/* this left shift by 8 gives net left shift by 11 */
	/* in order to maintain the same scale as that of */
	/* coefficients Wi */

	r2 = blk[B_SIZE * 6 + i];
	r3 = blk[B_SIZE * 2 + i];
	r4 = blk[B_SIZE * 1 + i];
	r5 = blk[B_SIZE * 7 + i];
	r6 = blk[B_SIZE * 5 + i];
	r7 = blk[B_SIZE * 3 + i];

	if (!(r1 \| r2 \| r3 \| r4 \| r5 \| r6 \| r7))
	{
	/* shortcut */
	/* execute if values of g(r,1) to g(r,7) in a column*/
	/* are all zeros */

	/* make output of IDCT >>3 or scaled by 1/8 and */
	/* with the proper rounding */
	a = (blk[B_SIZE * 0 + i]) << 3;
	tmpBLK32[B_SIZE * 0 + i] = a;
	tmpBLK32[B_SIZE * 1 + i] = a;
	tmpBLK32[B_SIZE * 2 + i] = a;
	tmpBLK32[B_SIZE * 3 + i] = a;
	tmpBLK32[B_SIZE * 4 + i] = a;
	tmpBLK32[B_SIZE * 5 + i] = a;
	tmpBLK32[B_SIZE * 6 + i] = a;
	tmpBLK32[B_SIZE * 7 + i] = a;
	}
	else
	{
	r0 = (blk[8 * 0 + i] << 11) + 128;

	/* first stage */

	r8 = W7 * (r4 + r5);
	r4 = (r8 + (W1 - W7) * r4);
	/* Multiplication with Wi increases the net left */
	/* shift from 11 to 14,we have to shift back by 3*/
	r5 = (r8 - (W1 + W7) * r5);
	r8 = W3 * (r6 + r7);
	r6 = (r8 - (W3 - W5) * r6);
	r7 = (r8 - (W3 + W5) * r7);

	/* second stage */
	r8 = r0 + r1;
	r0 -= r1;

	r1 = W6 * (r3 + r2);
	r2 = (r1 - (W2 + W6) * r2);
	r3 = (r1 + (W2 - W6) * r3);

	r1 = r4 + r6;
	r4 -= r6;
	r6 = r5 + r7;
	r5 -= r7;

	/* third stage */
	r7 = r8 + r3;
	r8 -= r3;
	r3 = r0 + r2;
	r0 -= r2;
	r2 = (181 * (r4 + r5) + 128) >> 8; /* rounding */
	r4 = (181 * (r4 - r5) + 128) >> 8;

	/* fourth stage */
	/* net shift of IDCT is >>3 after the following */
	/* shift operation, it makes output of 2-D IDCT */
	/* scaled by 1/8, that is scaled twice by */
	/* 1/(2sqrt(2)) for row IDCT and column IDCT. /
	/* see detail analysis in design doc. */
	tmpBLK32[0 + i] = (r7 + r1) >> 8;
	tmpBLK32[(1<<3) + i] = (r3 + r2) >> 8;
	tmpBLK32[(2<<3) + i] = (r0 + r4) >> 8;
	tmpBLK32[(3<<3) + i] = (r8 + r6) >> 8;
	tmpBLK32[(4<<3) + i] = (r8 - r6) >> 8;
	tmpBLK32[(5<<3) + i] = (r0 - r4) >> 8;
	tmpBLK32[(6<<3) + i] = (r3 - r2) >> 8;
	tmpBLK32[(7<<3) + i] = (r7 - r1) >> 8;
	}
	}
	/* row (horizontal) IDCT */
	for (i = 0 ; i < B_SIZE; i++)
	{
	/* initialize butterfly nodes at the first stage */

	r1 = ((int32)tmpBLK32[4+(i<<3)]) << 8;
	/* r1 left shift by 11 is to maintain the same */
	/* scale as that of coefficients (W1,...W7) */
	/* since blk[4] won't multiply with Wi. */
	/* see detail diagram in design document. */

	r2 = tmpBLK32[6+(i<<3)];
	r3 = tmpBLK32[2+(i<<3)];
	r4 = tmpBLK32[1+(i<<3)];
	r5 = tmpBLK32[7+(i<<3)];
	r6 = tmpBLK32[5+(i<<3)];
	r7 = tmpBLK32[3+(i<<3)];

	if (!(r1 \| r2 \| r3 \| r4 \| r5 \| r6 \| r7))
	{
	/* shortcut */
	/* execute if values of F(1,v) to F(7,v) in a row*/
	/* are all zeros */

	/* output of row IDCT scaled by 8 */
	a = (((int32)tmpBLK32[0+(i<<3)] + 32) >> 6);
	CLIP_RESULT(a)
	*comp++ = a;
	*comp++ = a;
	*comp++ = a;
	*comp++ = a;
	*comp++ = a;
	*comp++ = a;
	*comp++ = a;
	*comp++ = a;

	comp += offset;
	}

	else
	{
	/* for proper rounding in the fourth stage */
	r0 = (((int32)tmpBLK32[0+(i<<3)]) << 8) + 8192;

	/* first stage */

	r8 = W7 * (r4 + r5) + 4;
	r4 = (r8 + (W1 - W7) * r4) >> 3;
	r5 = (r8 - (W1 + W7) * r5) >> 3;

	r8 = W3 * (r6 + r7) + 4;
	r6 = (r8 - (W3 - W5) * r6) >> 3;
	r7 = (r8 - (W3 + W5) * r7) >> 3;

	/* second stage */
	r8 = r0 + r1;
	r0 -= r1;

	r1 = W6 * (r3 + r2) + 4;
	r2 = (r1 - (W2 + W6) * r2) >> 3;
	r3 = (r1 + (W2 - W6) * r3) >> 3;

	r1 = r4 + r6;
	r4 -= r6;
	r6 = r5 + r7;
	r5 -= r7;

	/* third stage */
	r7 = r8 + r3;
	r8 -= r3;
	r3 = r0 + r2;
	r0 -= r2;
	r2 = (181 * (r4 + r5) + 128) >> 8; /* rounding */
	r4 = (181 * (r4 - r5) + 128) >> 8;

	/* fourth stage */
	/* net shift of this function is <<3 after the */
	/* following shift operation, it makes output of */
	/* row IDCT scaled by 8 to retain 3 bits precision*/
	a = ((r7 + r1) >> 14);
	CLIP_RESULT(a)
	*comp++ = a;
	a = ((r3 + r2) >> 14);
	CLIP_RESULT(a)
	*comp++ = a;
	a = ((r0 + r4) >> 14);
	CLIP_RESULT(a)
	*comp++ = a;
	a = ((r8 + r6) >> 14);
	CLIP_RESULT(a)
	*comp++ = a;
	a = ((r8 - r6) >> 14);
	CLIP_RESULT(a)
	*comp++ = a;
	a = ((r0 - r4) >> 14);
	CLIP_RESULT(a)
	*comp++ = a;
	a = ((r3 - r2) >> 14);
	CLIP_RESULT(a)
	*comp++ = a;
	a = ((r7 - r1) >> 14);
	CLIP_RESULT(a)
	*comp++ = a;

	comp += offset;
	}
	}



	/*----------------------------------------------------------------------------
	; Return nothing or data or data pointer
	----------------------------------------------------------------------------*/
	return;
	}

	void idct(
	int blk, uint8 pred, uint8 *dst, int width)
	{
	/*----------------------------------------------------------------------------
	; Define all local variables
	----------------------------------------------------------------------------*/
	int i;
	int32 tmpBLK[64];
	int32 *tmpBLK32 = &tmpBLK[0];
	int32 r0, r1, r2, r3, r4, r5, r6, r7, r8; /* butterfly nodes */
	int32 a;
	int res;

	/*----------------------------------------------------------------------------
	; Function body here
	----------------------------------------------------------------------------*/
	/* two dimensional inverse discrete cosine transform */


	/* column (vertical) IDCT */
	for (i = B_SIZE - 1; i >= 0; i--)
	{
	/* initialize butterfly nodes at first stage */

	r1 = blk[B_SIZE * 4 + i] << 11;
	/* since row IDCT results have net left shift by 3 */
	/* this left shift by 8 gives net left shift by 11 */
	/* in order to maintain the same scale as that of */
	/* coefficients Wi */

	r2 = blk[B_SIZE * 6 + i];
	r3 = blk[B_SIZE * 2 + i];
	r4 = blk[B_SIZE * 1 + i];
	r5 = blk[B_SIZE * 7 + i];
	r6 = blk[B_SIZE * 5 + i];
	r7 = blk[B_SIZE * 3 + i];

	if (!(r1 \| r2 \| r3 \| r4 \| r5 \| r6 \| r7))
	{
	/* shortcut */
	/* execute if values of g(r,1) to g(r,7) in a column*/
	/* are all zeros */

	/* make output of IDCT >>3 or scaled by 1/8 and */
	/* with the proper rounding */
	a = (blk[B_SIZE * 0 + i]) << 3;
	tmpBLK32[B_SIZE * 0 + i] = a;
	tmpBLK32[B_SIZE * 1 + i] = a;
	tmpBLK32[B_SIZE * 2 + i] = a;
	tmpBLK32[B_SIZE * 3 + i] = a;
	tmpBLK32[B_SIZE * 4 + i] = a;
	tmpBLK32[B_SIZE * 5 + i] = a;
	tmpBLK32[B_SIZE * 6 + i] = a;
	tmpBLK32[B_SIZE * 7 + i] = a;
	}
	else
	{
	r0 = (blk[8 * 0 + i] << 11) + 128;

	/* first stage */

	r8 = W7 * (r4 + r5);
	r4 = (r8 + (W1 - W7) * r4);
	/* Multiplication with Wi increases the net left */
	/* shift from 11 to 14,we have to shift back by 3*/
	r5 = (r8 - (W1 + W7) * r5);
	r8 = W3 * (r6 + r7);
	r6 = (r8 - (W3 - W5) * r6);
	r7 = (r8 - (W3 + W5) * r7);

	/* second stage */
	r8 = r0 + r1;
	r0 -= r1;

	r1 = W6 * (r3 + r2);
	r2 = (r1 - (W2 + W6) * r2);
	r3 = (r1 + (W2 - W6) * r3);

	r1 = r4 + r6;
	r4 -= r6;
	r6 = r5 + r7;
	r5 -= r7;

	/* third stage */
	r7 = r8 + r3;
	r8 -= r3;
	r3 = r0 + r2;
	r0 -= r2;
	r2 = (181 * (r4 + r5) + 128) >> 8; /* rounding */
	r4 = (181 * (r4 - r5) + 128) >> 8;

	/* fourth stage */
	/* net shift of IDCT is >>3 after the following */
	/* shift operation, it makes output of 2-D IDCT */
	/* scaled by 1/8, that is scaled twice by */
	/* 1/(2sqrt(2)) for row IDCT and column IDCT. /
	/* see detail analysis in design doc. */
	tmpBLK32[0 + i] = (r7 + r1) >> 8;
	tmpBLK32[(1<<3) + i] = (r3 + r2) >> 8;
	tmpBLK32[(2<<3) + i] = (r0 + r4) >> 8;
	tmpBLK32[(3<<3) + i] = (r8 + r6) >> 8;
	tmpBLK32[(4<<3) + i] = (r8 - r6) >> 8;
	tmpBLK32[(5<<3) + i] = (r0 - r4) >> 8;
	tmpBLK32[(6<<3) + i] = (r3 - r2) >> 8;
	tmpBLK32[(7<<3) + i] = (r7 - r1) >> 8;
	}
	}
	/* row (horizontal) IDCT */
	for (i = B_SIZE - 1; i >= 0; i--)
	{
	/* initialize butterfly nodes at the first stage */

	r1 = ((int32)tmpBLK32[4+(i<<3)]) << 8;
	/* r1 left shift by 11 is to maintain the same */
	/* scale as that of coefficients (W1,...W7) */
	/* since blk[4] won't multiply with Wi. */
	/* see detail diagram in design document. */

	r2 = tmpBLK32[6+(i<<3)];
	r3 = tmpBLK32[2+(i<<3)];
	r4 = tmpBLK32[1+(i<<3)];
	r5 = tmpBLK32[7+(i<<3)];
	r6 = tmpBLK32[5+(i<<3)];
	r7 = tmpBLK32[3+(i<<3)];

	if (!(r1 \| r2 \| r3 \| r4 \| r5 \| r6 \| r7))
	{
	/* shortcut */
	/* execute if values of F(1,v) to F(7,v) in a row*/
	/* are all zeros */

	/* output of row IDCT scaled by 8 */
	a = (tmpBLK32[0+(i<<3)] + 32) >> 6;
	blk[0+(i<<3)] = a;
	blk[1+(i<<3)] = a;
	blk[2+(i<<3)] = a;
	blk[3+(i<<3)] = a;
	blk[4+(i<<3)] = a;
	blk[5+(i<<3)] = a;
	blk[6+(i<<3)] = a;
	blk[7+(i<<3)] = a;

	}

	else
	{
	/* for proper rounding in the fourth stage */
	r0 = (((int32)tmpBLK32[0+(i<<3)]) << 8) + 8192;

	/* first stage */

	r8 = W7 * (r4 + r5) + 4;
	r4 = (r8 + (W1 - W7) * r4) >> 3;
	r5 = (r8 - (W1 + W7) * r5) >> 3;

	r8 = W3 * (r6 + r7) + 4;
	r6 = (r8 - (W3 - W5) * r6) >> 3;
	r7 = (r8 - (W3 + W5) * r7) >> 3;

	/* second stage */
	r8 = r0 + r1;
	r0 -= r1;

	r1 = W6 * (r3 + r2) + 4;
	r2 = (r1 - (W2 + W6) * r2) >> 3;
	r3 = (r1 + (W2 - W6) * r3) >> 3;

	r1 = r4 + r6;
	r4 -= r6;
	r6 = r5 + r7;
	r5 -= r7;

	/* third stage */
	r7 = r8 + r3;
	r8 -= r3;
	r3 = r0 + r2;
	r0 -= r2;
	r2 = (181 * (r4 + r5) + 128) >> 8; /* rounding */
	r4 = (181 * (r4 - r5) + 128) >> 8;

	/* fourth stage */
	/* net shift of this function is <<3 after the */
	/* following shift operation, it makes output of */
	/* row IDCT scaled by 8 to retain 3 bits precision*/
	blk[0+(i<<3)] = (r7 + r1) >> 14;
	blk[1+(i<<3)] = (r3 + r2) >> 14;
	blk[2+(i<<3)] = (r0 + r4) >> 14;
	blk[3+(i<<3)] = (r8 + r6) >> 14;
	blk[4+(i<<3)] = (r8 - r6) >> 14;
	blk[5+(i<<3)] = (r0 - r4) >> 14;
	blk[6+(i<<3)] = (r3 - r2) >> 14;
	blk[7+(i<<3)] = (r7 - r1) >> 14;
	}
	/* add with prediction , 08/03/05 */
	res = (*pred++ + block[0+(i<<3)]);
	CLIP_RESULT(res);
	*dst++ = res;
	res = (*pred++ + block[1+(i<<3)]);
	CLIP_RESULT(res);
	*dst++ = res;
	res = (*pred++ + block[2+(i<<3)]);
	CLIP_RESULT(res);
	*dst++ = res;
	res = (*pred++ + block[3+(i<<3)]);
	CLIP_RESULT(res);
	*dst++ = res;
	res = (*pred++ + block[4+(i<<3)]);
	CLIP_RESULT(res);
	*dst++ = res;
	res = (*pred++ + block[5+(i<<3)]);
	CLIP_RESULT(res);
	*dst++ = res;
	res = (*pred++ + block[6+(i<<3)]);
	CLIP_RESULT(res);
	*dst++ = res;
	res = (*pred++ + block[7+(i<<3)]);
	CLIP_RESULT(res);
	*dst++ = res;

	pred += 8;
	dst += (width - 8);
	}



	/*----------------------------------------------------------------------------
	; Return nothing or data or data pointer
	----------------------------------------------------------------------------*/
	return;
	}

	#endif
	/*----------------------------------------------------------------------------
	; End Function: idct
	----------------------------------------------------------------------------*/