SwapUV AVX2 and SSSE3

Based on ARGBShuffle but with count adjusted and new shuffle mask

BUG=libyuv:809

Change-Id: Idd936ee6bedcf285607a68c2fc54d876b4becc01
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1711882
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
diff --git a/README.chromium b/README.chromium
index 41eae98..d372e45 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1733
+Version: 1734
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 5cbdaad..bae8315 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -275,6 +275,7 @@
 #define HAS_I422TOAR30ROW_SSSE3
 #define HAS_MERGERGBROW_SSSE3
 #define HAS_SPLITRGBROW_SSSE3
+#define HAS_SWAPUVROW_SSSE3
 #endif
 
 // The following are available for AVX2 gcc/clang x86 platforms:
@@ -295,6 +296,7 @@
 #define HAS_I422TOYUY2ROW_AVX2
 #define HAS_MERGEUVROW_16_AVX2
 #define HAS_MULTIPLYROW_16_AVX2
+#define HAS_SWAPUVROW_AVX2
 // TODO(fbarchard): Fix AVX2 version of YUV24
 // #define HAS_NV21TOYUV24ROW_AVX2
 #endif
@@ -3374,6 +3376,10 @@
 void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
 void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
 void SwapUVRow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width);
 void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
 void AYUVToUVRow_C(const uint8_t* src_ayuv,
                    int stride_ayuv,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 0f245bf..0f9a450 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1733
+#define LIBYUV_VERSION 1734
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index d739baa..5a9d56d 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -527,6 +527,22 @@
     src_stride_uv = dst_stride_vu = 0;
   }
 
+#if defined(HAS_SWAPUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    SwapUVRow = SwapUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      SwapUVRow = SwapUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SWAPUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SwapUVRow = SwapUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SwapUVRow = SwapUVRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_SWAPUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     SwapUVRow = SwapUVRow_Any_NEON;
diff --git a/source/row_any.cc b/source/row_any.cc
index ef89350..da91347 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -710,6 +710,12 @@
 #ifdef HAS_AYUVTOYROW_NEON
 ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
 #endif
+#ifdef HAS_SWAPUVROW_SSSE3
+ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15)
+#endif
+#ifdef HAS_SWAPUVROW_AVX2
+ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31)
+#endif
 #ifdef HAS_SWAPUVROW_NEON
 ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15)
 #endif
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 18b6350..7c6dee1 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -6790,6 +6790,68 @@
 }
 #endif  // HAS_NV21TOYUV24ROW_AVX2
 
+#ifdef HAS_SWAPUVROW_SSSE3
+
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleUVToVU = {1u, 0u,  3u,  2u,  5u,  4u,  7u,  6u,
+                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
+void SwapUVRow_SSSE3(const uint8_t* src_uv,
+                           uint8_t* dst_vu,
+                           int width) {
+  asm volatile(
+
+      "movdqu    %3,%%xmm5                      \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pshufb    %%xmm5,%%xmm0                   \n"
+      "pshufb    %%xmm5,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_vu),  // %1
+        "+r"(width)    // %2
+      : "m"(kShuffleUVToVU)    // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif  // HAS_SWAPUVROW_SSSE3
+
+#ifdef HAS_SWAPUVROW_AVX2
+void SwapUVRow_AVX2(const uint8_t* src_uv,
+                          uint8_t* dst_vu,
+                          int width) {
+  asm volatile(
+
+      "vbroadcastf128 %3,%%ymm5                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
+      "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
+      "vmovdqu   %%ymm0,(%1)                     \n"
+      "vmovdqu   %%ymm1,0x20(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_vu),  // %1
+        "+r"(width)    // %2
+      : "m"(kShuffleUVToVU)    // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif  // HAS_SWAPUVROW_AVX2
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index c564ced..a71e752 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -994,6 +994,7 @@
 TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
 TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
 TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
+// TODO(fbarchard): Investigate high error on Win32.
 TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, 10)
 TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
 TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)