/* ------------------------------------------------------------------
* Copyright (C) 1998-2009 PacketVideo
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied.
* See the License for the specific language governing permissions
* and limitations under the License.
* -------------------------------------------------------------------
*/
/*********************************************************************************/
/* Filename: sad_inline.h */
/* Description: Implementation of in-line functions used for SAD computation */
/* Modified: */
/*********************************************************************************/
#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_
#ifdef __cplusplus
extern "C"
{
#endif
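/* Three equivalent implementations of these helpers are provided, selected at
 * compile time: a portable C version (default), an ARM RVCT inline-assembly
 * version (__CC_ARM), and a GNU ARM inline-assembly version
 * (PV_ARM_GCC_V5 / PV_ARM_GCC_V4).
 */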
#if !defined(PV_ARM_GCC_V5) && !defined(PV_ARM_GCC_V4) /* default C implementation */
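/* SUB_SAD: accumulate the absolute difference |tmp - tmp2| into the running sad. */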
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
tmp = tmp - tmp2;
if (tmp > 0) sad += tmp;
else sad -= tmp;
return sad;
}
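/* sad_4pixel: byte-wise absolute difference of two words that each hold four
 * packed 8-bit pixels (a SWAR trick, no unpacking). mask is expected to be
 * 0x80808080 (see x9 in simd_sad_mb); it marks the bytes that went negative
 * in the packed subtraction so they can be corrected to their absolute value.
 * The result is four packed byte differences.
 */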
__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
int32 x7;
x7 = src2 ^ src1; /* check odd/even combination */
if ((uint32)src2 >= (uint32)src1)
{
src1 = src2 - src1; /* subs */
}
else
{
src1 = src1 - src2;
}
x7 = x7 ^ src1; /* only odd bytes need to add carry */
x7 = mask & ((uint32)x7 >> 1);
x7 = (x7 << 8) - x7;
src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
src1 = src1 ^(x7 >> 7); /* take absolute value of negative byte */
return src1;
}
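/* sad_mb_offset.h is expanded three times with (NUMBER, SHIFT) = (3, 24),
 * (2, 16) and (1, 8) to generate sad_mb_offset3/2/1, the macroblock SAD
 * routines used below when ref is 3, 2 or 1 bytes off word alignment.
 */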
#define NUMBER 3
#define SHIFT 24
#include "sad_mb_offset.h"
#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"
#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"
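/* simd_sad_mb: SAD of a 16x16 macroblock. ref has row stride lx, blk has row
 * stride 16, and four pixels are handled per 32-bit load. Unaligned ref
 * pointers are dispatched to the matching sad_mb_offset variant; the loop
 * terminates early once the running SAD exceeds dmin.
 */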
__inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
{
int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
x9 = 0x80808080; /* const. */
x8 = (uintptr_t)ref & 0x3;
if (x8 == 3)
goto SadMBOffset3;
if (x8 == 2)
goto SadMBOffset2;
if (x8 == 1)
goto SadMBOffset1;
// x5 = (x4<<8)-x4; /* x5 = x4*255; */
x4 = x5 = 0;
x6 = 0xFFFF00FF;
ref -= lx;
blk -= 16;
x8 = 16;
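/* x8 counts the 16 rows; ref and blk are pre-decremented so the pointer
 * updates at the top of the loop (ref += lx, blk += 16) land on row 0 in the
 * first iteration.
 */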
LOOP_SAD0:
/****** process 8 pixels ******/
x10 = *((uint32*)(ref += lx));
x11 = *((uint32*)(ref + 4));
x12 = *((uint32*)(blk += 16));
x14 = *((uint32*)(blk + 4));
/* process x11 & x14 */
x11 = sad_4pixel(x11, x14, x9);
/* process x12 & x10 */
x10 = sad_4pixel(x10, x12, x9);
x5 = x5 + x10; /* accumulate low bytes */
x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
x5 = x5 + x11; /* accumulate low bytes */
x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
/****** process 8 pixels ******/
x10 = *((uint32*)(ref + 8));
x11 = *((uint32*)(ref + 12));
x12 = *((uint32*)(blk + 8));
x14 = *((uint32*)(blk + 12));
/* process x11 & x14 */
x11 = sad_4pixel(x11, x14, x9);
/* process x12 & x10 */
x10 = sad_4pixel(x10, x12, x9);
x5 = x5 + x10; /* accumulate low bytes */
x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
x5 = x5 + x11; /* accumulate low bytes */
x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
/****************/
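/* Reduction: x5 holds the packed byte sums (low bytes may have carried into
 * the high bytes), x4 holds the separately accumulated high-byte sums.
 * x5 - (x4 << 8) recovers the low-byte sums, adding x4 yields two 16-bit
 * partial sums, and adding (x10 << 16) folds them so the total SAD sits in
 * the upper half word (hence the >> 16 on return).
 */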
x10 = x5 - (x4 << 8); /* extract low bytes */
x10 = x10 + x4; /* add with high bytes */
x10 = x10 + (x10 << 16); /* add with lower half word */
if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
{
if (--x8)
{
goto LOOP_SAD0;
}
}
return ((uint32)x10 >> 16);
SadMBOffset3:
return sad_mb_offset3(ref, blk, lx, dmin);
SadMBOffset2:
return sad_mb_offset2(ref, blk, lx, dmin);
SadMBOffset1:
return sad_mb_offset1(ref, blk, lx, dmin);
}
#elif defined(__CC_ARM) /* only work with arm v5 */
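/* ARM RVCT (armcc) implementations of the same helpers, using __asm blocks;
 * the surrounding C code follows the same structure as the default version above.
 */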
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
__asm
{
rsbs tmp, tmp, tmp2 ;
rsbmi tmp, tmp, #0 ;
add sad, sad, tmp ;
}
return sad;
}
__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
int32 x7;
__asm
{
EOR x7, src2, src1; /* check odd/even combination */
SUBS src1, src2, src1;
EOR x7, x7, src1;
AND x7, mask, x7, lsr #1;
ORRCC x7, x7, #0x80000000;
RSB x7, x7, x7, lsl #8;
ADD src1, src1, x7, asr #7; /* add 0xFF to the negative byte, add back carry */
EOR src1, src1, x7, asr #7; /* take absolute value of negative byte */
}
return src1;
}
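/* sad_4pixelN: companion of sad_4pixel intended for the sad_mb_offset.h
 * expansions; it forms the packed difference with ADDS (one operand is
 * presumably supplied bit-inverted by the caller) and folds the carry flag
 * in via RRX.
 */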
__inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
{
int32 x7;
__asm
{
EOR x7, src2, src1; /* check odd/even combination */
ADDS src1, src2, src1;
EOR x7, x7, src1; /* only odd bytes need to add carry */
ANDS x7, mask, x7, rrx;
RSB x7, x7, x7, lsl #8;
SUB src1, src1, x7, asr #7; /* add 0xFF to the negative byte, add back carry */
EOR src1, src1, x7, asr #7; /* take absolute value of negative byte */
}
return src1;
}
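/* sum_accumulate: assembly form of the byte-wise accumulation done in the SAD
 * loops (x5 gathers the packed byte sums, x4 the separately tracked high
 * bytes); it is referenced by the code generated from sad_mb_offset.h.
 */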
#define sum_accumulate __asm{ SBC x5, x5, x10; /* accumulate low bytes */ \
BIC x10, x6, x10; /* x10 & 0xFF00FF00 */ \
ADD x4, x4, x10,lsr #8; /* accumulate high bytes */ \
SBC x5, x5, x11; /* accumulate low bytes */ \
BIC x11, x6, x11; /* x11 & 0xFF00FF00 */ \
ADD x4, x4, x11,lsr #8; } /* accumulate high bytes */
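/* INC_X8: per-iteration step for the packed loop counter kept in x8 by the
 * assembly sad_mb_offset variants (compare the 0x10000001 step applied by
 * ADDLSS in simd_sad_mb below); the packing, and hence the value, differs per
 * offset.
 */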
#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001
#include "sad_mb_offset.h"
#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"
#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"
__inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
{
int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
x9 = 0x80808080; /* const. */
x4 = x5 = 0;
__asm
{
MOVS x8, ref, lsl #31 ;
BHI SadMBOffset3;
BCS SadMBOffset2;
BMI SadMBOffset1;
MVN x6, #0xFF00;
}
LOOP_SAD0:
/****** process 8 pixels ******/
x11 = *((int32*)(ref + 12));
x10 = *((int32*)(ref + 8));
x14 = *((int32*)(blk + 12));
x12 = *((int32*)(blk + 8));
/* process x11 & x14 */
x11 = sad_4pixel(x11, x14, x9);
/* process x12 & x10 */
x10 = sad_4pixel(x10, x12, x9);
x5 = x5 + x10; /* accumulate low bytes */
x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
x5 = x5 + x11; /* accumulate low bytes */
x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
__asm
{
/****** process 8 pixels ******/
LDR x11, [ref, #4];
LDR x10, [ref], lx ;
LDR x14, [blk, #4];
LDR x12, [blk], #16 ;
}
/* process x11 & x14 */
x11 = sad_4pixel(x11, x14, x9);
/* process x12 & x10 */
x10 = sad_4pixel(x10, x12, x9);
x5 = x5 + x10; /* accumulate low bytes */
x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
x5 = x5 + x11; /* accumulate low bytes */
x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
/****************/
x10 = x5 - (x4 << 8); /* extract low bytes */
x10 = x10 + x4; /* add with high bytes */
x10 = x10 + (x10 << 16); /* add with lower half word */
__asm
{
/****************/
RSBS x11, dmin, x10, lsr #16;
ADDLSS x8, x8, #0x10000001;
BLS LOOP_SAD0;
}
return ((uint32)x10 >> 16);
SadMBOffset3:
return sad_mb_offset3(ref, blk, lx, dmin, x8);
SadMBOffset2:
return sad_mb_offset2(ref, blk, lx, dmin, x8);
SadMBOffset1:
return sad_mb_offset1(ref, blk, lx, dmin, x8);
}
#elif ( defined(PV_ARM_GCC_V5) || defined(PV_ARM_GCC_V4) ) /* ARM GNU COMPILER */
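/* GNU ARM (GCC) implementations, mirroring the __CC_ARM versions above but
 * written with extended inline assembly and register constraints.
 */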
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
register int32 out;
register int32 temp1;
register int32 ss = sad;
register int32 tt = tmp;
register int32 uu = tmp2;
asm volatile("rsbs %1, %4, %3\n\t"
"rsbmi %1, %1, #0\n\t"
"add %0, %2, %1"
: "=&r"(out),
"=&r"(temp1)
: "r"(ss),
"r"(tt),
"r"(uu));
return out;
}
__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
register int32 out;
register int32 temp1;
register int32 s1 = src1;
register int32 s2 = src2;
register int32 mm = mask;
asm volatile("eor %0, %3, %2\n\t"
"subs %1, %3, %2\n\t"
"eor %0, %0, %1\n\t"
"and %0, %4, %0, lsr #1\n\t"
"orrcc %0, %0, #0x80000000\n\t"
"rsb %0, %0, %0, lsl #8\n\t"
"add %1, %1, %0, asr #7\n\t"
"eor %1, %1, %0, asr #7"
: "=&r"(out),
"=&r"(temp1)
: "r"(s1),
"r"(s2),
"r"(mm));
return temp1;
}
__inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
{
register int32 out;
register int32 temp1;
register int32 s1 = src1;
register int32 s2 = src2;
register int32 mm = mask;
asm volatile("eor %1, %3, %2\n\t"
"adds %0, %3, %2\n\t"
"eor %1, %1, %0\n\t"
"ands %1, %4, %1,rrx\n\t"
"rsb %1, %1, %1, lsl #8\n\t"
"sub %0, %0, %1, asr #7\n\t"
"eor %0, %0, %1, asr #7"
: "=&r"(out),
"=&r"(temp1)
: "r"(s1),
"r"(s2),
"r"(mm));
return (out);
}
#define sum_accumulate asm volatile("sbc %0, %0, %1\n\t" \
"bic %1, %4, %1\n\t" \
"add %2, %2, %1, lsr #8\n\t" \
"sbc %0, %0, %3\n\t" \
"bic %3, %4, %3\n\t" \
"add %2, %2, %3, lsr #8" \
:"+r"(x5), "+r"(x10), "+r"(x4), "+r"(x11) \
:"r"(x6));
#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001
#include "sad_mb_offset.h"
#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"
#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"
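/* simd_sad_mb: same structure as the portable C version above; the mask
 * constant (MVN #0xFF00 -> 0xFFFF00FF), one group of loads per iteration and
 * the row-pointer updates are done in inline assembly.
 */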
__inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
{
int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
x9 = 0x80808080; /* const. */
x4 = x5 = 0;
x8 = (uintptr_t)ref & 0x3;
if (x8 == 3)
goto SadMBOffset3;
if (x8 == 2)
goto SadMBOffset2;
if (x8 == 1)
goto SadMBOffset1;
asm volatile("mvn %0, #0xFF00": "=r"(x6));
LOOP_SAD0:
/****** process 8 pixels ******/
x11 = *((int32*)(ref + 12));
x10 = *((int32*)(ref + 8));
x14 = *((int32*)(blk + 12));
x12 = *((int32*)(blk + 8));
/* process x11 & x14 */
x11 = sad_4pixel(x11, x14, x9);
/* process x12 & x10 */
x10 = sad_4pixel(x10, x12, x9);
x5 = x5 + x10; /* accumulate low bytes */
x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
x5 = x5 + x11; /* accumulate low bytes */
x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
asm volatile("ldr %0, [%4, #4]\n\t"
"ldr %1, [%4], %6\n\t"
"ldr %2, [%5, #4]\n\t"
"ldr %3, [%5], #16"
: "=r"(x11), "=r"(x10), "=r"(x14), "=r"(x12), "+r"(ref), "+r"(blk)
: "r"(lx));
/* process x11 & x14 */
x11 = sad_4pixel(x11, x14, x9);
/* process x12 & x10 */
x10 = sad_4pixel(x10, x12, x9);
x5 = x5 + x10; /* accumulate low bytes */
x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
x5 = x5 + x11; /* accumulate low bytes */
x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
/****************/
x10 = x5 - (x4 << 8); /* extract low bytes */
x10 = x10 + x4; /* add with high bytes */
x10 = x10 + (x10 << 16); /* add with lower half word */
if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
{
if (--x8)
{
goto LOOP_SAD0;
}
}
return ((uint32)x10 >> 16);
SadMBOffset3:
return sad_mb_offset3(ref, blk, lx, dmin);
SadMBOffset2:
return sad_mb_offset2(ref, blk, lx, dmin);
SadMBOffset1:
return sad_mb_offset1(ref, blk, lx, dmin);
}
#endif // compiler selection
#ifdef __cplusplus
}
#endif
#endif // _SAD_INLINE_H_