Alpha blend ARGB into ARGB: add ARGBBlend with C, SSE2, and SSSE3 row paths.
Also pass src_stride as a register operand in the scale.cc inline asm.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/428009
git-svn-id: http://libyuv.googlecode.com/svn/trunk@203 16f28f9a-4ce2-e073-06de-1de4eb20be90
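
A minimal usage sketch of the new entry point (BlendFrame and its buffers are
hypothetical; assumes packed ARGB rows with stride = width * 4 and the libyuv
namespace used by these headers):

    #include "libyuv/planar_functions.h"

    void BlendFrame(const uint8* fg_argb, uint8* bg_argb,
                    int width, int height) {
      // Blend fg over bg in place, using fg's per-pixel alpha channel.
      libyuv::ARGBBlend(fg_argb, width * 4,  // source and its byte stride
                        bg_argb, width * 4,  // destination, updated in place
                        width, height);
    }
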
diff --git a/README.chromium b/README.chromium
index d046e76..c54a85c 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 201
+Version: 203
License: BSD
License File: LICENSE
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 214235a..051f848 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -133,6 +133,11 @@
uint8* dst_argb, int dst_stride_argb,
int width, int height);
+// Alpha Blend ARGB images and store to destination.
+// Negative height inverts the source image.
+int ARGBBlend(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
int I422ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 1c23058..e069731 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 201
+#define LIBYUV_VERSION 203
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index e4c0d9d..72afce8 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -140,6 +140,43 @@
return 0;
}
+
+// Alpha Blend ARGB images and store to destination.
+int ARGBBlend(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ void (*ARGBBlendRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBBlendRow = ARGBBlendRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 2)) {
+ ARGBBlendRow = ARGBBlendRow_SSSE3;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBBlendRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
// Convert I422 to ARGB.
int I422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
diff --git a/source/row.h b/source/row.h
index f1da41e..14bc6dc 100644
--- a/source/row.h
+++ b/source/row.h
@@ -64,6 +64,11 @@
#define HAS_UYVYTOUVROW_SSE2
#endif
+// The following are available on Windows platforms:
+#if defined(_MSC_VER) && !defined(YUV_DISABLE_ASM)
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBBLENDROW_SSE2
+#endif
+
// The following are available on Neon platforms
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_NEON
@@ -239,6 +244,10 @@
uint8* rgb_buf,
int width);
+void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+
// 'Any' wrappers use memcpy()
void I420ToARGBRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf,
diff --git a/source/row_common.cc b/source/row_common.cc
index 30b1da6..224f7f4 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -452,6 +452,138 @@
}
}
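+// BLENDER approximates (f * a + b * (255 - a)) / 255 with +0x80 rounding
+// and a shift by 8 (a divide by 256, accurate to within one step).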
+#define BLENDER(f, b, a) ((((f) * (a)) + ((b) * ((a) ^ 0xff)) + 0x80) >> 8)
+void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
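+  // Per-pixel fast paths: alpha 0 skips the pixel and alpha 255 copies the
+  // source; only partially transparent pixels pay for the multiplies.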
+ for (int x = 0; x < width - 1; x += 2) {
+ uint32 a = src_argb[3];
+ if (a) {
+ if (a < 255) {
+ const uint32 fb = src_argb[0];
+ const uint32 fg = src_argb[1];
+ const uint32 fr = src_argb[2];
+ const uint32 bb = dst_argb[0];
+ const uint32 bg = dst_argb[1];
+ const uint32 br = dst_argb[2];
+ dst_argb[0] = BLENDER(fb, bb, a);
+ dst_argb[1] = BLENDER(fg, bg, a);
+ dst_argb[2] = BLENDER(fr, br, a);
+ dst_argb[3] = 255u;
+ } else {
+ *(uint32*)dst_argb = *(uint32*)src_argb;
+ }
+ }
+ a = src_argb[4 + 3];
+ if (a) {
+ if (a < 255) {
+ const uint32 fb = src_argb[4 + 0];
+ const uint32 fg = src_argb[4 + 1];
+ const uint32 fr = src_argb[4 + 2];
+ const uint32 bb = dst_argb[4 + 0];
+ const uint32 bg = dst_argb[4 + 1];
+ const uint32 br = dst_argb[4 + 2];
+ dst_argb[4 + 0] = BLENDER(fb, bb, a);
+ dst_argb[4 + 1] = BLENDER(fg, bg, a);
+ dst_argb[4 + 2] = BLENDER(fr, br, a);
+ dst_argb[4 + 3] = 255u;
+ } else {
+ *(uint32*)(dst_argb + 4) = *(uint32*)(src_argb + 4);
+ }
+ }
+ src_argb += 8;
+ dst_argb += 8;
+ }
+
+ if (width & 1) {
+ const uint32 a = src_argb[3];
+ if (a) {
+ if (a < 255) {
+ const uint32 fb = src_argb[0];
+ const uint32 fg = src_argb[1];
+ const uint32 fr = src_argb[2];
+ const uint32 bb = dst_argb[0];
+ const uint32 bg = dst_argb[1];
+ const uint32 br = dst_argb[2];
+ dst_argb[0] = BLENDER(fb, bb, a);
+ dst_argb[1] = BLENDER(fg, bg, a);
+ dst_argb[2] = BLENDER(fr, br, a);
+ dst_argb[3] = 255u;
+ } else {
+ *(uint32*)dst_argb = *(uint32*)src_argb;
+ }
+ }
+ }
+}
+
+#if 0  // Alternative implementation: blends a whole 32-bit pixel per load.
+void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint32 f = *(uint32*)src_argb;
+ uint32 a = f >> 24;
+ if (a) {
+ const uint32 b = *(uint32*)dst_argb;
+ if (a < 255) {
+ const uint32 src_rb = f & 0x00ff00ff;
+ const uint32 dst_rb = b & 0x00ff00ff;
+ const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
+ 0xff00ff00;
+
+ const uint32 src_g = f & 0x0000ff00;
+ const uint32 dst_g = b & 0x0000ff00;
+ const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
+ 0x00ff0000);
+
+ f = ((out_rb | out_g) >> 8) | 0xff000000;
+ }
+ *(uint32*)dst_argb = f;
+ }
+
+ f = *(uint32*)(src_argb + 4);
+ a = f >> 24;
+ if (a) {
+ const uint32 b = *(uint32*)(dst_argb + 4);
+ if (a < 255) {
+ const uint32 src_rb = f & 0x00ff00ff;
+ const uint32 dst_rb = b & 0x00ff00ff;
+ const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
+ 0xff00ff00;
+
+ const uint32 src_g = f & 0x0000ff00;
+ const uint32 dst_g = b & 0x0000ff00;
+ const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
+ 0x00ff0000);
+
+ f = ((out_rb | out_g) >> 8) | 0xff000000;
+ }
+ *(uint32*)(dst_argb + 4) = f;
+ }
+ src_argb += 8;
+ dst_argb += 8;
+ }
+
+ if (width & 1) {
+ uint32 f = *(uint32*)src_argb;
+ uint32 a = f >> 24;
+ if (a) {
+ const uint32 b = *(uint32*)dst_argb;
+ if (a < 255) {
+ const uint32 src_rb = f & 0x00ff00ff;
+ const uint32 dst_rb = b & 0x00ff00ff;
+ const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
+ 0xff00ff00;
+
+ const uint32 src_g = f & 0x0000ff00;
+ const uint32 dst_g = b & 0x0000ff00;
+ const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
+ 0x00ff0000);
+
+ f = ((out_rb | out_g) >> 8) | 0xff000000;
+ }
+ *(uint32*)dst_argb = f;
+ }
+ }
+}
+#endif
+
// Wrappers to handle odd sizes/alignments
#define MAKEYUVANY(NAMEANY, NAME, COPYROW) \
void NAMEANY(const uint8* y_buf, \
diff --git a/source/row_win.cc b/source/row_win.cc
index 8b008e8..519edbb 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -1909,6 +1909,121 @@
}
#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for copying alpha
+static const uvec8 kShuffleAlpha = {
+ 7u, 7u, 7u, 7u, 7u, 7u, 0x80, 0x80, 15u, 15u, 15u, 15u, 15u, 15u, 0x80, 0x80
+};
+
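+// Blends 2 pixels per iteration with pmaddubsw on interleaved dst/src bytes.
+// Alpha is reduced to 6 bits so the signed coefficients stay in range, with
+// 0x0020 rounding in 8.6 fixed point.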
+__declspec(naked)
+void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, 0x00200020 // rounding constant for 8.6 fixed point
+ movd xmm3, eax
+ pshufd xmm3, xmm3, 0
+ mov eax, 0x3f3f3f3f // mask for alpha
+ movd xmm7, eax
+ pshufd xmm7, xmm7, 0
+ movdqa xmm4, kShuffleAlpha
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ pcmpeqb xmm6, xmm6 // generate 0x00010001 for negating
+ psrlw xmm6, 15
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+
+ convertloop:
+ movq xmm0, qword ptr [eax] // fetch 2 pixels
+ movq xmm1, qword ptr [eax + edx]
+ punpcklbw xmm1, xmm0 // mix 2 pixels aArRgGbB_aArRgGbB
+ movdqa xmm2, xmm1 // alpha from byte 7 and 15
+ pshufb xmm2, xmm4
+ pxor xmm2, xmm5
+ psrlw xmm2, 2
+ pand xmm2, xmm7
+ paddw xmm2, xmm6 // -a = (a^255)+1
+ pmaddubsw xmm1, xmm2
+ paddw xmm1, xmm3 // round
+ psrlw xmm1, 6
+
+ packuswb xmm1, xmm1 // pack 2 pixels
+ sub ecx, 2
+ movq qword ptr [eax + edx], xmm1
+ lea eax, [eax + 8]
+ ja convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// TODO(fbarchard): Single multiply method b+a(f-b)
+// TODO(fbarchard): Unroll and pair
+// TODO(fbarchard): Test for transparent and opaque common cases
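+// punpcklbw v,v widens each byte to v * 257 in a 16-bit word, so pmulhuw by
+// the duplicated alpha approximates the divide-by-255 of an exact blend.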
+__declspec(naked)
+void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+    pcmpeqb    xmm4, xmm4       // generate 0xffffffff for negating alpha
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ sub ecx, 1
+ je last1
+
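+    // blend 2 pixels per iteration; an odd final pixel is handled at last1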
+ convertloop:
+ movq xmm0, qword ptr [eax] // fetch 2 pixels
+ movq xmm1, qword ptr [eax + edx]
+ punpcklbw xmm0, xmm0 // src 16 bits
+ punpcklbw xmm1, xmm1 // dst 16 bits
+ pshuflw xmm2, xmm0, 0xff // src alpha
+ pshufhw xmm2, xmm2, 0xff
+ movdqa xmm3, xmm2 // dst alpha
+ pxor xmm3, xmm4
+ pmulhuw xmm0, xmm2 // src * a
+ pmulhuw xmm1, xmm3 // dst * (a ^ 0xffff)
+ paddw xmm0, xmm1
+ psrlw xmm0, 8
+ packuswb xmm0, xmm0 // pack 2 pixels
+ sub ecx, 2
+ movq qword ptr [eax + edx], xmm0
+ lea eax, [eax + 8]
+ ja convertloop
+
+ last1:
+ add ecx, 1
+ je done
+
+ mov ecx, [eax] // handle remaining pixel
+ movd xmm0, ecx
+ mov ecx, [eax + edx]
+ movd xmm1, ecx
+ punpcklbw xmm0, xmm0 // src 16 bits
+ punpcklbw xmm1, xmm1 // dst 16 bits
+ pshuflw xmm2, xmm0, 0xff // src alpha
+ pshufhw xmm2, xmm2, 0xff
+ movdqa xmm3, xmm2 // dst alpha
+ pxor xmm3, xmm4
+ pmulhuw xmm0, xmm2 // src * a
+ pmulhuw xmm1, xmm3 // dst * (a ^ 0xffff)
+ paddw xmm0, xmm1
+ psrlw xmm0, 8
+ packuswb xmm0, xmm0 // pack 2 pixels
+
+ movd ecx, xmm0
+ mov dword ptr [eax + edx], ecx
+
+  done:
+    ret
+ }
+}
+#endif  // HAS_ARGBBLENDROW_SSE2
+
#endif // _M_IX86
#ifdef __cplusplus
diff --git a/source/scale.cc b/source/scale.cc
index c2fcbb2..44ba937 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1699,20 +1699,21 @@
uint16* dst_ptr, int src_width, int src_height) {
int tmp_height = 0;
intptr_t tmp_src = 0;
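+  // Copy the stride into a local so it can be passed as a register operand.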
+ intptr_t tmp_src_stride = static_cast<intptr_t>(src_stride);
asm volatile (
"pxor %%xmm4,%%xmm4 \n"
- "sub $0x1,%5 \n"
+ "sub $0x1,%6 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"mov %0,%3 \n"
- "add %6,%0 \n"
+ "add %4,%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm4,%%xmm0 \n"
"punpckhbw %%xmm4,%%xmm1 \n"
- "mov %5,%2 \n"
+ "mov %6,%2 \n"
"2: \n"
"movdqa (%0),%%xmm2 \n"
- "add %6,%0 \n"
+ "add %4,%0 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
"punpckhbw %%xmm4,%%xmm3 \n"
@@ -1724,15 +1725,16 @@
"movdqa %%xmm1,0x10(%1) \n"
"lea 0x10(%3),%0 \n"
"lea 0x20(%1),%1 \n"
- "sub $0x10,%4 \n"
+ "sub $0x10,%5 \n"
"ja 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_height), // %2
"+r"(tmp_src), // %3
- "+r"(src_width), // %4
- "+rm"(src_height) // %5
- : "rm"(static_cast<intptr_t>(src_stride)) // %6
+ "+r"(tmp_src_stride), // %4
+ "+rm"(src_width), // %5
+ "+rm"(src_height) // %6
+ :
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
@@ -1740,7 +1742,6 @@
);
}
-
#if defined(__i386__)
extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);