Use simple masking for the AVX2 version of CopyAlpha so it can be implemented with a more generic bit-mask function in the future, and use more widely known and well-optimized opcodes that will always be fast. Same performance as vblend.
BUG=none
TEST=CopyAlpha*
R=johannkoenig@google.com
Review URL: https://webrtc-codereview.appspot.com/2393005
git-svn-id: http://libyuv.googlecode.com/svn/trunk@813 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/README.chromium b/README.chromium
index ea02c60..73080d9 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 812
+Version: 813
License: BSD
License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 84f3bab..69a7076 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -168,7 +168,6 @@
// TODO(fbarchard): Optimize and enable
// #define HAS_ARGBLUMACOLORTABLEROW_SSSE3
#define HAS_ARGBCOPYALPHAROW_SSE2
-#define HAS_ARGBCOPYALPHAROW_SSE41
// Caveat: Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
@@ -702,7 +701,6 @@
void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width);
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width);
-void ARGBCopyAlphaRow_SSE41(const uint8* src, uint8* dst, int width);
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width);
void SetRow_X86(uint8* dst, uint32 v32, int count);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 564fd3f..cd64572 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 812
+#define LIBYUV_VERSION 813
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 65993c9..243a87f 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -2195,11 +2195,6 @@
ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
}
#endif
-#if defined(HAS_ARGBCOPYALPHAROW_SSE41)
- if (TestCpuFlag(kCpuHasSSE41) && IS_ALIGNED(width, 8)) {
- ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE41;
- }
-#endif
#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
diff --git a/source/row_win.cc b/source/row_win.cc
index 3a8eaef..dd2152f 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -3640,35 +3640,6 @@
}
#endif // HAS_ARGBCOPYALPHAROW_SSE2
-#ifdef HAS_ARGBCOPYALPHAROW_SSE41
-// width in pixels
-__declspec(naked) __declspec(align(16))
-void ARGBCopyAlphaRow_SSE41(const uint8* src, uint8* dst, int width) {
- __asm {
- mov eax, [esp + 4] // src
- mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
- pcmpeqb xmm0, xmm0 // generate mask 0x00ffffff
- psrld xmm0, 8
-
- align 4
- convertloop:
- movdqu xmm1, [eax]
- movdqu xmm2, [eax + 16]
- lea eax, [eax + 32]
- pblendvb xmm1, [edx], xmm0
- pblendvb xmm2, [edx + 16], xmm0
- movdqu [edx], xmm1
- movdqu [edx + 16], xmm2
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- ret
- }
-}
-#endif // HAS_ARGBCOPYALPHAROW_SSE41
-
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
__declspec(naked) __declspec(align(16))
@@ -3677,18 +3648,21 @@
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
- vpcmpeqb ymm0, ymm0, ymm0 // generate mask 0x00ffffff
- vpsrld ymm0, ymm0, 8
+ vpcmpeqb ymm0, ymm0, ymm0
+ vpsrld ymm1, ymm0, 8 // generate mask 0x00ffffff
+ vpslld ymm0, ymm0, 24 // generate mask 0xff000000
align 4
convertloop:
- vmovdqu ymm1, [eax]
- vmovdqu ymm2, [eax + 32]
+ vpand ymm2, ymm0, [eax]
+ vpand ymm3, ymm0, [eax + 32]
lea eax, [eax + 64]
- vpblendvb ymm1, ymm1, [edx], ymm0
- vpblendvb ymm2, ymm2, [edx + 32], ymm0
- vmovdqu [edx], ymm1
- vmovdqu [edx + 32], ymm2
+ vpand ymm4, ymm1, [edx]
+ vpand ymm5, ymm1, [edx + 32]
+ vpor ymm2, ymm2, ymm4
+ vpor ymm3, ymm3, ymm5
+ vmovdqu [edx], ymm2
+ vmovdqu [edx + 32], ymm3
lea edx, [edx + 64]
sub ecx, 16
jg convertloop
@@ -6958,7 +6932,8 @@
// 2 pixel loop.
align 16
convertloop:
-// (slow) pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
+// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
+// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
movq xmm0, qword ptr [eax] // BGRABGRA
lea eax, [eax + 8]
punpcklbw xmm0, xmm3