Alpha blend ARGB into ARGB
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/428009
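
A minimal usage sketch of the new API (buffer sizes, contents, and the
BlendExample name here are illustrative only):

  #include <string.h>

  #include "libyuv/planar_functions.h"

  // Blend a foreground ARGB frame over a background ARGB frame in place.
  // Pixels where the foreground alpha is 0 leave the background untouched,
  // alpha 255 copies the foreground, and anything in between is blended.
  int BlendExample() {
    const int width = 64;
    const int height = 48;
    const int stride_argb = width * 4;      // ARGB is 4 bytes per pixel.
    uint8 fg_argb[64 * 48 * 4];             // foreground; alpha is byte 3.
    uint8 bg_argb[64 * 48 * 4];             // background; blended in place.
    memset(fg_argb, 0, sizeof(fg_argb));    // fully transparent foreground.
    memset(bg_argb, 255, sizeof(bg_argb));  // opaque white background.
    // Returns 0 on success, -1 on invalid arguments.
    return libyuv::ARGBBlend(fg_argb, stride_argb, bg_argb, stride_argb,
                             width, height);
  }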

git-svn-id: http://libyuv.googlecode.com/svn/trunk@203 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/README.chromium b/README.chromium
index d046e76..c54a85c 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 201
+Version: 203
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 214235a..051f848 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -133,6 +133,11 @@
              uint8* dst_argb, int dst_stride_argb,
              int width, int height);
 
+// Alpha Blend src_argb over dst_argb and store to dst_argb.
+int ARGBBlend(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height);
+
 int I422ToYUY2(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 1c23058..e069731 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 201
+#define LIBYUV_VERSION 203
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
 
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index e4c0d9d..72afce8 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -140,6 +140,43 @@
   return 0;
 }
 
+// Alpha Blend src_argb over dst_argb and store to dst_argb.
+int ARGBBlend(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+
+  void (*ARGBBlendRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+      ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBBlendRow = ARGBBlendRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) &&
+      IS_ALIGNED(width, 2)) {
+    ARGBBlendRow = ARGBBlendRow_SSSE3;
+  }
+#endif
+
+  for (int y = 0; y < height; ++y) {
+    ARGBBlendRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
 // Convert I422 to ARGB.
 int I422ToARGB(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
diff --git a/source/row.h b/source/row.h
index f1da41e..14bc6dc 100644
--- a/source/row.h
+++ b/source/row.h
@@ -64,6 +64,11 @@
 #define HAS_UYVYTOUVROW_SSE2
 #endif
 
+#if defined(_MSC_VER)
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBBLENDROW_SSE2
+#endif
+
 // The following are available on Neon platforms
 #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
 #define HAS_MIRRORROW_NEON
@@ -239,6 +244,10 @@
                      uint8* rgb_buf,
                      int width);
 
+void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+
 // 'Any' wrappers use memcpy()
 void I420ToARGBRow_Any_SSSE3(const uint8* y_buf,
                              const uint8* u_buf,
diff --git a/source/row_common.cc b/source/row_common.cc
index 30b1da6..224f7f4 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -452,6 +452,138 @@
   }
 }
 
+#define BLENDER(f, b, a) (((f) * (a) + (b) * ((a) ^ 0xff) + 0x80) >> 8)
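+// BLENDER approximates f * a / 255 + b * (255 - a) / 255; the >> 8 stands
+// in for the divide by 255 and the + 0x80 rounds, keeping the result within
+// 1 of an exact divide. e.g. f=255, b=0, a=128: (255*128 + 0x80) >> 8 = 128.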
+void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    uint32 a = src_argb[3];
+    if (a) {
+      if (a < 255) {
+        const uint32 fb = src_argb[0];
+        const uint32 fg = src_argb[1];
+        const uint32 fr = src_argb[2];
+        const uint32 bb = dst_argb[0];
+        const uint32 bg = dst_argb[1];
+        const uint32 br = dst_argb[2];
+        dst_argb[0] = BLENDER(fb, bb, a);
+        dst_argb[1] = BLENDER(fg, bg, a);
+        dst_argb[2] = BLENDER(fr, br, a);
+        dst_argb[3] = 255u;
+      } else {
+        *(uint32*)dst_argb = *(uint32*)src_argb;
+      }
+    }
+    a = src_argb[4 + 3];
+    if (a) {
+      if (a < 255) {
+        const uint32 fb = src_argb[4 + 0];
+        const uint32 fg = src_argb[4 + 1];
+        const uint32 fr = src_argb[4 + 2];
+        const uint32 bb = dst_argb[4 + 0];
+        const uint32 bg = dst_argb[4 + 1];
+        const uint32 br = dst_argb[4 + 2];
+        dst_argb[4 + 0] = BLENDER(fb, bb, a);
+        dst_argb[4 + 1] = BLENDER(fg, bg, a);
+        dst_argb[4 + 2] = BLENDER(fr, br, a);
+        dst_argb[4 + 3] = 255u;
+      } else {
+        *(uint32*)(dst_argb + 4) = *(uint32*)(src_argb + 4);
+      }
+    }
+    src_argb += 8;
+    dst_argb += 8;
+  }
+
+  if (width & 1) {
+    const uint32 a = src_argb[3];
+    if (a) {
+      if (a < 255) {
+        const uint32 fb = src_argb[0];
+        const uint32 fg = src_argb[1];
+        const uint32 fr = src_argb[2];
+        const uint32 bb = dst_argb[0];
+        const uint32 bg = dst_argb[1];
+        const uint32 br = dst_argb[2];
+        dst_argb[0] = BLENDER(fb, bb, a);
+        dst_argb[1] = BLENDER(fg, bg, a);
+        dst_argb[2] = BLENDER(fr, br, a);
+        dst_argb[3] = 255u;
+      } else {
+        *(uint32*)dst_argb = *(uint32*)src_argb;
+      }
+    }
+  }
+}
+
+#if 0
+void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    uint32 f = *(uint32*)src_argb;
+    uint32 a = f >> 24;
+    if (a) {
+      const uint32 b = *(uint32*)dst_argb;
+      if (a < 255) {
+        const uint32 src_rb = f & 0x00ff00ff;
+        const uint32 dst_rb = b & 0x00ff00ff;
+        const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
+            0xff00ff00;
+
+        const uint32 src_g = f & 0x0000ff00;
+        const uint32 dst_g = b & 0x0000ff00;
+        const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
+            0x00ff0000);
+
+        f = ((out_rb | out_g) >> 8) | 0xff000000;
+      }
+      *(uint32*)dst_argb = f;
+    }
+
+    f = *(uint32*)(src_argb + 4);
+    a = f >> 24;
+    if (a) {
+      const uint32 b = *(uint32*)(dst_argb + 4);
+      if (a < 255) {
+        const uint32 src_rb = f & 0x00ff00ff;
+        const uint32 dst_rb = b & 0x00ff00ff;
+        const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
+            0xff00ff00;
+
+        const uint32 src_g = f & 0x0000ff00;
+        const uint32 dst_g = b & 0x0000ff00;
+        const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
+            0x00ff0000);
+
+        f = ((out_rb | out_g) >> 8) | 0xff000000;
+      }
+      *(uint32*)(dst_argb + 4) = f;
+    }
+    src_argb += 8;
+    dst_argb += 8;
+  }
+
+  if (width & 1) {
+    uint32 f = *(uint32*)src_argb;
+    uint32 a = f >> 24;
+    if (a) {
+      const uint32 b = *(uint32*)dst_argb;
+      if (a < 255) {
+        const uint32 src_rb = f & 0x00ff00ff;
+        const uint32 dst_rb = b & 0x00ff00ff;
+        const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
+            0xff00ff00;
+
+        const uint32 src_g = f & 0x0000ff00;
+        const uint32 dst_g = b & 0x0000ff00;
+        const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
+            0x00ff0000);
+
+        f = ((out_rb | out_g) >> 8) | 0xff000000;
+      }
+      *(uint32*)dst_argb = f;
+    }
+  }
+}
+#endif
+
 // Wrappers to handle odd sizes/alignments
 #define MAKEYUVANY(NAMEANY, NAME, COPYROW)                                     \
     void NAMEANY(const uint8* y_buf,                                           \
diff --git a/source/row_win.cc b/source/row_win.cc
index 8b008e8..519edbb 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -1909,6 +1909,121 @@
 }
 #endif  // HAS_YUY2TOYROW_SSE2
 
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for copying alpha
+static const uvec8 kShuffleAlpha = {
+  7u, 7u, 7u, 7u, 7u, 7u, 0x80, 0x80, 15u, 15u, 15u, 15u, 15u, 15u, 0x80, 0x80
+};
+
+__declspec(naked)
+void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, 0x00200020      // rounding constant for 8.6 fixed point
+    movd       xmm3, eax
+    pshufd     xmm3, xmm3, 0
+    mov        eax, 0x3f3f3f3f      // mask for alpha
+    movd       xmm7, eax
+    pshufd     xmm7, xmm7, 0
+    movdqa     xmm4, kShuffleAlpha
+    pcmpeqb    xmm5, xmm5       // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    pcmpeqb    xmm6, xmm6       // generate 0x00010001 for negating
+    psrlw      xmm6, 15
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax
+
+ convertloop:
+    movq       xmm0, qword ptr [eax]      // fetch 2 pixels
+    movq       xmm1, qword ptr [eax + edx]
+    punpcklbw  xmm1, xmm0       // mix 2 pixels aArRgGbB_aArRgGbB
+    movdqa     xmm2, xmm1       // alpha from byte 7 and 15
+    pshufb     xmm2, xmm4
+    pxor       xmm2, xmm5
+    psrlw      xmm2, 2
+    pand       xmm2, xmm7
+    paddw      xmm2, xmm6       // -a = (a^255)+1
+    pmaddubsw  xmm1, xmm2
+    paddw      xmm1, xmm3       // round
+    psrlw      xmm1, 6
+
+    packuswb   xmm1, xmm1       // pack 2 pixels
+    sub        ecx, 2
+    movq       qword ptr [eax + edx], xmm1
+    lea        eax, [eax + 8]
+    ja         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// TODO(fbarchard): Single multiply method b+a(f-b)
+// TODO(fbarchard): Unroll and pair
+// TODO(fbarchard): Test for transparent and opaque common cases
+__declspec(naked)
+void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    pcmpeqb    xmm4, xmm4       // generate 0xffffffff to negate alpha
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax
+    sub        ecx, 1
+    je         last1
+
+ convertloop:
+    movq       xmm0, qword ptr [eax]      // fetch 2 pixels
+    movq       xmm1, qword ptr [eax + edx]
+    punpcklbw  xmm0, xmm0       // src 16 bits
+    punpcklbw  xmm1, xmm1       // dst 16 bits
+    pshuflw    xmm2, xmm0, 0xff // src alpha
+    pshufhw    xmm2, xmm2, 0xff
+    movdqa     xmm3, xmm2       // dst alpha
+    pxor       xmm3, xmm4
+    pmulhuw    xmm0, xmm2       // src * a
+    pmulhuw    xmm1, xmm3       // dst * (a ^ 0xffff)
+    paddw      xmm0, xmm1
+    psrlw      xmm0, 8
+    packuswb   xmm0, xmm0       // pack 2 pixels
+    sub        ecx, 2
+    movq       qword ptr [eax + edx], xmm0
+    lea        eax, [eax + 8]
+    ja         convertloop
+
+ last1:
+    add        ecx, 1
+    je         done
+
+    mov        ecx,  [eax]      // handle remaining pixel
+    movd       xmm0, ecx
+    mov        ecx,  [eax + edx]
+    movd       xmm1, ecx
+    punpcklbw  xmm0, xmm0       // src 16 bits
+    punpcklbw  xmm1, xmm1       // dst 16 bits
+    pshuflw    xmm2, xmm0, 0xff // src alpha
+    pshufhw    xmm2, xmm2, 0xff
+    movdqa     xmm3, xmm2       // dst alpha
+    pxor       xmm3, xmm4
+    pmulhuw    xmm0, xmm2       // src * a
+    pmulhuw    xmm1, xmm3       // dst * (a ^ 0xffff)
+    paddw      xmm0, xmm1
+    psrlw      xmm0, 8
+    packuswb   xmm0, xmm0       // pack 2 pixels
+
+    movd       ecx, xmm0
+    mov        dword ptr [eax + edx], ecx
+
+ done:
+
+    ret
+  }
+}
+#endif  // HAS_ARGBBLENDROW_SSE2
+
 #endif  // _M_IX86
 
 #ifdef __cplusplus
diff --git a/source/scale.cc b/source/scale.cc
index c2fcbb2..44ba937 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1699,20 +1699,21 @@
                               uint16* dst_ptr, int src_width, int src_height) {
   int tmp_height = 0;
   intptr_t tmp_src = 0;
+  intptr_t tmp_src_stride = static_cast<intptr_t>(src_stride);
   asm volatile (
     "pxor      %%xmm4,%%xmm4                   \n"
-    "sub       $0x1,%5                         \n"
+    "sub       $0x1,%6                         \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "mov       %0,%3                           \n"
-    "add       %6,%0                           \n"
+    "add       %4,%0                           \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklbw %%xmm4,%%xmm0                   \n"
     "punpckhbw %%xmm4,%%xmm1                   \n"
-    "mov       %5,%2                           \n"
+    "mov       %6,%2                           \n"
   "2:                                          \n"
     "movdqa    (%0),%%xmm2                     \n"
-    "add       %6,%0                           \n"
+    "add       %4,%0                           \n"
     "movdqa    %%xmm2,%%xmm3                   \n"
     "punpcklbw %%xmm4,%%xmm2                   \n"
     "punpckhbw %%xmm4,%%xmm3                   \n"
@@ -1724,15 +1725,16 @@
     "movdqa    %%xmm1,0x10(%1)                 \n"
     "lea       0x10(%3),%0                     \n"
     "lea       0x20(%1),%1                     \n"
-    "sub       $0x10,%4                        \n"
+    "sub       $0x10,%5                        \n"
     "ja        1b                              \n"
   : "+r"(src_ptr),     // %0
     "+r"(dst_ptr),     // %1
     "+r"(tmp_height),  // %2
     "+r"(tmp_src),     // %3
-    "+r"(src_width),  // %4
-    "+rm"(src_height)  // %5
-  : "rm"(static_cast<intptr_t>(src_stride))  // %6
+    "+r"(tmp_src_stride), // %4
+    "+rm"(src_width),  // %5
+    "+rm"(src_height)  // %6
+  :
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
@@ -1740,7 +1742,6 @@
   );
 }
 
-
 #if defined(__i386__)
 extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width);