/* ------------------------------------------------------------------
* Copyright (C) 1998-2009 PacketVideo
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied.
* See the License for the specific language governing permissions
* and limitations under the License.
* -------------------------------------------------------------------
*/
/*********************************************************************************/
/* Filename: sad_inline.h */
/* Description: Implementation of in-line functions used for SAD computation */
/* Modified: */
/*********************************************************************************/
#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_
#ifdef __cplusplus
extern "C"
{
#endif
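/* Three equivalent implementations of these helpers are provided, selected at
 * compile time: a portable C version (default), an ARM RVCT inline-assembly
 * version (__CC_ARM), and a GNU ARM inline-assembly version
 * (PV_ARM_GCC_V5 / PV_ARM_GCC_V4).
 */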
#if !defined(PV_ARM_GCC_V5) && !defined(PV_ARM_GCC_V4) /* default C implementation */
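/* SUB_SAD: accumulate the absolute difference |tmp - tmp2| into the running sad. */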
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
tmp = tmp - tmp2;
if (tmp > 0) sad += tmp;
else sad -= tmp;
return sad;
}
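/* sad_4pixel: byte-wise absolute difference of two words that each hold four
 * packed 8-bit pixels (a SWAR trick, no unpacking). mask is expected to be
 * 0x80808080 (see x9 in simd_sad_mb); it marks the bytes that went negative
 * in the packed subtraction so they can be corrected to their absolute value.
 * The result is four packed byte differences.
 */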
__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
int32 x7;
x7 = src2 ^ src1; /* check odd/even combination */
if ((uint32)src2 >= (uint32)src1)
{
src1 = src2 - src1; /* subs */
}
else
{
src1 = src1 - src2;
}
x7 = x7 ^ src1; /* only odd bytes need to add carry */
x7 = mask & ((uint32)x7 >> 1);
x7 = (x7 << 8) - x7;
src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
src1 = src1 ^(x7 >> 7); /* take absolute value of negative byte */
return src1;
}
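/* sad_mb_offset.h is expanded three times with (NUMBER, SHIFT) = (3, 24),
 * (2, 16) and (1, 8) to generate sad_mb_offset3/2/1, the macroblock SAD
 * routines used below when ref is 3, 2 or 1 bytes off word alignment.
 */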
#define NUMBER 3
#define SHIFT 24
#include "sad_mb_offset.h"
#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"
#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"
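/* simd_sad_mb: SAD of a 16x16 macroblock. ref has row stride lx, blk has row
 * stride 16, and four pixels are handled per 32-bit load. Unaligned ref
 * pointers are dispatched to the matching sad_mb_offset variant; the loop
 * terminates early once the running SAD exceeds dmin.
 */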
__inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
{
int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
x9 = 0x80808080; /* const. */
x8 = (uintptr_t)ref & 0x3;
if (x8 == 3)
goto SadMBOffset3;
if (x8 == 2)
goto SadMBOffset2;
if (x8 == 1)
goto SadMBOffset1;
// x5 = (x4<<8)-x4; /* x5 = x4*255; */
x4 = x5 = 0;
x6 = 0xFFFF00FF;
ref -= lx;
blk -= 16;
x8 = 16;
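/* x8 counts the 16 rows; ref and blk are pre-decremented so the pointer
 * updates at the top of the loop (ref += lx, blk += 16) land on row 0 in the
 * first iteration.
 */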
LOOP_SAD0:
/****** process 8 pixels ******/
x10 = *((uint32*)(ref += lx));
x11 = *((uint32*)(ref + 4));
x12 = *((uint32*)(blk += 16));
x14 = *((uint32*)(blk + 4));
/* process x11 & x14 */
x11 = sad_4pixel(x11, x14, x9);
/* process x12 & x10 */
x10 = sad_4pixel(x10, x12, x9);
x5 = x5 + x10; /* accumulate low bytes */
x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
x5 = x5 + x11; /* accumulate low bytes */
x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
/****** process 8 pixels ******/
x10 = *((uint32*)(ref + 8));
x11 = *((uint32*)(ref + 12));
x12 = *((uint32*)(blk + 8));
x14 = *((uint32*)(blk + 12));
/* process x11 & x14 */
x11 = sad_4pixel(x11, x14, x9);
/* process x12 & x10 */
x10 = sad_4pixel(x10, x12, x9);
x5 = x5 + x10; /* accumulate low bytes */
x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
x5 = x5 + x11; /* accumulate low bytes */
x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
/****************/
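/* Reduction: x5 holds the packed byte sums (low bytes may have carried into
 * the high bytes), x4 holds the separately accumulated high-byte sums.
 * x5 - (x4 << 8) recovers the low-byte sums, adding x4 yields two 16-bit
 * partial sums, and adding (x10 << 16) folds them so the total SAD sits in
 * the upper half word (hence the >> 16 on return).
 */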
x10 = x5 - (x4 << 8); /* extract low bytes */
x10 = x10 + x4; /* add with high bytes */
x10 = x10 + (x10 << 16); /* add with lower half word */
if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
{
if (--x8)
{
goto LOOP_SAD0;
}
}
return ((uint32)x10 >> 16);
SadMBOffset3:
return sad_mb_offset3(ref, blk, lx, dmin);
SadMBOffset2:
return sad_mb_offset2(ref, blk, lx, dmin);
SadMBOffset1:
return sad_mb_offset1(ref, blk, lx, dmin);
}
#elif defined(__CC_ARM) /* only work with arm v5 */
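/* ARM RVCT (armcc) implementations of the same helpers, using __asm blocks;
 * the surrounding C code follows the same structure as the default version above.
 */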
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
__asm
{
rsbs tmp, tmp, tmp2 ;
rsbmi tmp, tmp, #0 ;
add sad, sad, tmp ;
}
return sad;
}
__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
int32 x7;
__asm
{
EOR x7, src2, src1; /* check odd/even combination */
SUBS src1, src2, src1;
EOR x7, x7, src1;
AND x7, mask, x7, lsr #1;
ORRCC x7, x7, #0x80000000;
RSB x7, x7, x7, lsl #8;
ADD src1, src1, x7, asr #7; /* add 0xFF to the negative byte, add back carry */
EOR src1, src1, x7, asr #7; /* take absolute value of negative byte */
}
return src1;
}
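/* sad_4pixelN: companion of sad_4pixel intended for the sad_mb_offset.h
 * expansions; it forms the packed difference with ADDS (one operand is
 * presumably supplied bit-inverted by the caller) and folds the carry flag
 * in via RRX.
 */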
__inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
{
int32 x7;
__asm
{
EOR x7, src2, src1; /* check odd/even combination */
ADDS src1, src2, src1;
EOR x7, x7, src1; /* only odd bytes need to add carry */
ANDS x7, mask, x7, rrx;
RSB x7, x7, x7, lsl #8;
SUB src1, src1, x7, asr #7; /* add 0xFF to the negative byte, add back carry */
EOR src1, src1, x7, asr #7; /* take absolute value of negative byte */
}
return src1;
}
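/* sum_accumulate: assembly form of the byte-wise accumulation done in the SAD
 * loops (x5 gathers the packed byte sums, x4 the separately tracked high
 * bytes); it is referenced by the code generated from sad_mb_offset.h.
 */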
#define sum_accumulate __asm{ SBC x5, x5, x10; /* accumulate low bytes */ \
BIC x10, x6, x10; /* x10 & 0xFF00FF00 */ \
ADD x4, x4, x10,lsr #8; /* accumulate high bytes */ \
SBC x5, x5, x11; /* accumulate low bytes */ \
BIC x11, x6, x11; /* x11 & 0xFF00FF00 */ \
ADD x4, x4, x11,lsr #8; } /* accumulate high bytes */
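/* INC_X8: per-iteration step for the packed loop counter kept in x8 by the
 * assembly sad_mb_offset variants (compare the 0x10000001 step applied by
 * ADDLSS in simd_sad_mb below); the packing, and hence the value, differs per
 * offset.
 */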
#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001
#include "sad_mb_offset.h"
#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"
#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"
__inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
{
int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
x9 = 0x80808080; /* const. */
x4 = x5 = 0;
__asm
{
MOVS x8, ref, lsl #31 ;
BHI SadMBOffset3;
BCS SadMBOffset2;
BMI SadMBOffset1;
MVN x6, #0xFF00;
}
LOOP_SAD0:
/****** process 8 pixels ******/
x11 = *((int32*)(ref + 12));
x10 = *((int32*)(ref + 8));
x14 = *((int32*)(blk + 12));
x12 = *((int32*)(blk + 8));
/* process x11 & x14 */
x11 = sad_4pixel(x11, x14, x9);
/* process x12 & x10 */
x10 = sad_4pixel(x10, x12, x9);
x5 = x5 + x10; /* accumulate low bytes */
x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
x5 = x5 + x11; /* accumulate low bytes */
x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
__asm
{
/****** process 8 pixels ******/
LDR x11, [ref, #4];
LDR x10, [ref], lx ;
LDR x14, [blk, #4];
LDR x12, [blk], #16 ;
}
/* process x11 & x14 */
x11 = sad_4pixel(x11, x14, x9);
/* process x12 & x10 */
x10 = sad_4pixel(x10, x12, x9);
x5 = x5 + x10; /* accumulate low bytes */
x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
x5 = x5 + x11; /* accumulate low bytes */
x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
/****************/
x10 = x5 - (x4 << 8); /* extract low bytes */
x10 = x10 + x4; /* add with high bytes */
x10 = x10 + (x10 << 16); /* add with lower half word */
__asm
{
/****************/
RSBS x11, dmin, x10, lsr #16;
ADDLSS x8, x8, #0x10000001;
BLS LOOP_SAD0;
}
return ((uint32)x10 >> 16);
SadMBOffset3:
return sad_mb_offset3(ref, blk, lx, dmin, x8);
SadMBOffset2:
return sad_mb_offset2(ref, blk, lx, dmin, x8);
SadMBOffset1:
return sad_mb_offset1(ref, blk, lx, dmin, x8);
}
#elif ( defined(PV_ARM_GCC_V5) || defined(PV_ARM_GCC_V4) ) /* ARM GNU COMPILER */
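/* GNU ARM (GCC) implementations, mirroring the __CC_ARM versions above but
 * written with extended inline assembly and register constraints.
 */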
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
register int32 out;
register int32 temp1;
register int32 ss = sad;
register int32 tt = tmp;
register int32 uu = tmp2;
asm volatile("rsbs %1, %4, %3\n\t"
"rsbmi %1, %1, #0\n\t"
"add %0, %2, %1"
: "=&r"(out),
"=&r"(temp1)
: "r"(ss),
"r"(tt),
"r"(uu));
return out;
}
__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
register int32 out;
register int32 temp1;
register int32 s1 = src1;
register int32 s2 = src2;
register int32 mm = mask;
asm volatile("eor %0, %3, %2\n\t"
"subs %1, %3, %2\n\t"
"eor %0, %0, %1\n\t"
"and %0, %4, %0, lsr #1\n\t"
"orrcc %0, %0, #0x80000000\n\t"
"rsb %0, %0, %0, lsl #8\n\t"
"add %1, %1, %0, asr #7\n\t"
"eor %1, %1, %0, asr #7"
: "=&r"(out),
"=&r"(temp1)
: "r"(s1),
"r"(s2),
"r"(mm));
return temp1;
}
__inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
{
register int32 out;
register int32 temp1;
register int32 s1 = src1;
register int32 s2 = src2;
register int32 mm = mask;
asm volatile("eor %1, %3, %2\n\t"
"adds %0, %3, %2\n\t"
"eor %1, %1, %0\n\t"
"ands %1, %4, %1,rrx\n\t"
"rsb %1, %1, %1, lsl #8\n\t"
"sub %0, %0, %1, asr #7\n\t"
"eor %0, %0, %1, asr #7"
: "=&r"(out),
"=&r"(temp1)
: "r"(s1),
"r"(s2),
"r"(mm));
return (out);
}
#define sum_accumulate asm volatile("sbc %0, %0, %1\n\t" \
"bic %1, %4, %1\n\t" \
"add %2, %2, %1, lsr #8\n\t" \
"sbc %0, %0, %3\n\t" \
"bic %3, %4, %3\n\t" \
"add %2, %2, %3, lsr #8" \
:"+r"(x5), "+r"(x10), "+r"(x4), "+r"(x11) \
:"r"(x6));
#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001
#include "sad_mb_offset.h"
#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"
#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"
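/* simd_sad_mb: same structure as the portable C version above; the mask
 * constant (MVN #0xFF00 -> 0xFFFF00FF), one group of loads per iteration and
 * the row-pointer updates are done in inline assembly.
 */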
__inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
{
int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
x9 = 0x80808080; /* const. */
x4 = x5 = 0;
x8 = (uintptr_t)ref & 0x3;
if (x8 == 3)
goto SadMBOffset3;
if (x8 == 2)
goto SadMBOffset2;
if (x8 == 1)
goto SadMBOffset1;
asm volatile("mvn %0, #0xFF00": "=r"(x6));
LOOP_SAD0:
/****** process 8 pixels ******/
x11 = *((int32*)(ref + 12));
x10 = *((int32*)(ref + 8));
x14 = *((int32*)(blk + 12));
x12 = *((int32*)(blk + 8));
/* process x11 & x14 */
x11 = sad_4pixel(x11, x14, x9);
/* process x12 & x10 */
x10 = sad_4pixel(x10, x12, x9);
x5 = x5 + x10; /* accumulate low bytes */
x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
x5 = x5 + x11; /* accumulate low bytes */
x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
asm volatile("ldr %0, [%4, #4]\n\t"
"ldr %1, [%4], %6\n\t"
"ldr %2, [%5, #4]\n\t"
"ldr %3, [%5], #16"
: "=r"(x11), "=r"(x10), "=r"(x14), "=r"(x12), "+r"(ref), "+r"(blk)
: "r"(lx));
/* process x11 & x14 */
x11 = sad_4pixel(x11, x14, x9);
/* process x12 & x10 */
x10 = sad_4pixel(x10, x12, x9);
x5 = x5 + x10; /* accumulate low bytes */
x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
x5 = x5 + x11; /* accumulate low bytes */
x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
/****************/
x10 = x5 - (x4 << 8); /* extract low bytes */
x10 = x10 + x4; /* add with high bytes */
x10 = x10 + (x10 << 16); /* add with lower half word */
if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
{
if (--x8)
{
goto LOOP_SAD0;
}
}
return ((uint32)x10 >> 16);
SadMBOffset3:
return sad_mb_offset3(ref, blk, lx, dmin);
SadMBOffset2:
return sad_mb_offset2(ref, blk, lx, dmin);
SadMBOffset1:
return sad_mb_offset1(ref, blk, lx, dmin);
}
#endif // compiler selection
#ifdef __cplusplus
}
#endif
#endif // _SAD_INLINE_H_