ARGB To I420 and variations using row functions
BUG=none
TEST=media_unittests from talk, used to benchmark
Review URL: http://webrtc-codereview.appspot.com/254001
git-svn-id: http://libyuv.googlecode.com/svn/trunk@51 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h
index 0e1ab48..c1000e8 100644
--- a/include/libyuv/cpu_id.h
+++ b/include/libyuv/cpu_id.h
@@ -20,6 +20,9 @@
// These flags are only valid on ARM processors
static const int kCpuHasNEON = 4;
+// Internal flag to indicate cpuid is initialized.
+static const int kCpuInitialized = 8;
+
// Detect CPU has SSE2 etc.
bool TestCpuFlag(int flag);
diff --git a/source/convert.cc b/source/convert.cc
index ee7af0c..8154dcb 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -636,185 +636,6 @@
return 0;
}
-// ARGBToI420Row_C etc row functions use the following macro, generating
-// code with RGB offsets/strides different for each version. Less error
-// prone than duplicating the code.
-// template could be used, but macro method works for C and asm and this is
-// performance critical code.
-
-#define MAKEROWRGBTOI420(NAME,R,G,B,BPP) \
-static void \
-NAME(const uint8* src_row0, const uint8* src_row1, \
- uint8* dst_yplane0, uint8* dst_yplane1, \
- uint8* dst_u, \
- uint8* dst_v, \
- int width) { \
- for (int x = 0; x < width - 1; x += 2) { \
- dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \
- src_row0[G] * 129 + \
- src_row0[B] * 25 + 128) >> 8) + 16; \
- dst_yplane0[1] = (uint8)((src_row0[R + BPP] * 66 + \
- src_row0[G + BPP] * 129 + \
- src_row0[B + BPP] * 25 + 128) >> 8) + 16; \
- dst_yplane1[0] = (uint8)((src_row1[R] * 66 + \
- src_row1[G] * 129 + \
- src_row1[B] * 25 + 128) >> 8) + 16; \
- dst_yplane1[1] = (uint8)((src_row1[R + BPP] * 66 + \
- src_row1[G + BPP] * 129 + \
- src_row1[B + BPP] * 25 + 128) >> 8) + 16; \
- dst_u[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \
- src_row1[R] + src_row1[R + BPP]) * -38 + \
- (src_row0[G] + src_row0[G + BPP] + \
- src_row1[G] + src_row1[G + BPP]) * -74 + \
- (src_row0[B] + src_row0[B + BPP] + \
- src_row1[B] + src_row1[B + BPP]) * 112 + \
- + 512) >> 10) + 128; \
- dst_v[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \
- src_row1[R] + src_row1[R + BPP]) * 112 + \
- (src_row0[G] + src_row0[G + BPP] + \
- src_row1[G] + src_row1[G + BPP]) * -94 + \
- (src_row0[B] + src_row0[B + BPP] + \
- src_row1[B] + src_row1[B + BPP]) * -18 + \
- + 512) >> 10) + 128; \
- dst_yplane0 += 2; \
- dst_yplane1 += 2; \
- ++dst_u; \
- ++dst_v; \
- src_row0 += BPP * 2; \
- src_row1 += BPP * 2; \
- } \
- if (width & 1) { \
- dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \
- src_row0[G] * 129 + \
- src_row0[B] * 25 + 128) >> 8) + 16; \
- dst_yplane1[0] = (uint8)((src_row1[R] * 66 + \
- src_row1[G] * 129 + \
- src_row1[B] * 25 + 128) >> 8) + 16; \
- dst_u[0] = (uint8)(((src_row0[R] + \
- src_row1[R]) * -38 + \
- (src_row0[G] + \
- src_row1[G]) * -74 + \
- (src_row0[B] + \
- src_row1[B]) * 112 + \
- + 256) >> 9) + 128; \
- dst_v[0] = (uint8)(((src_row0[R] + \
- src_row1[R]) * 112 + \
- (src_row0[G] + \
- src_row1[G]) * -94 + \
- (src_row0[B] + \
- src_row1[B]) * -18 + \
- + 256) >> 9) + 128; \
- } \
-}
-
-// Generate variations of RGBToI420. Parameters are r,g,b offsets within a
-// pixel, and number of bytes per pixel.
-MAKEROWRGBTOI420(ARGBToI420Row_C, 2, 1, 0, 4)
-MAKEROWRGBTOI420(BGRAToI420Row_C, 1, 2, 3, 4)
-MAKEROWRGBTOI420(ABGRToI420Row_C, 0, 1, 2, 4)
-MAKEROWRGBTOI420(RGB24ToI420Row_C, 2, 1, 0, 3)
-MAKEROWRGBTOI420(RAWToI420Row_C, 0, 1, 2, 3)
-
-static int RGBToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height,
- void (*RGBToI420Row)(const uint8* src_row0,
- const uint8* src_row1,
- uint8* dst_yplane0,
- uint8* dst_yplane1,
- uint8* dst_u,
- uint8* dst_v,
- int width)) {
- if (src_frame == NULL || dst_y == NULL ||
- dst_v == NULL || dst_v == NULL)
- return -1;
-
- if (height < 0) {
- height = -height;
- src_frame = src_frame + src_stride_frame * (height -1);
- src_stride_frame = -src_stride_frame;
- }
- for (int y = 0; y < height - 1; y += 2) {
- RGBToI420Row(src_frame, src_frame + src_stride_frame,
- dst_y, dst_y + dst_stride_y,
- dst_u, dst_v,
- width);
- src_frame += src_stride_frame * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- RGBToI420Row(src_frame, src_frame,
- dst_y, dst_y,
- dst_u, dst_v,
- width);
- }
- return 0;
-}
-
-int ARGBToI420_Reference(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return RGBToI420(src_frame, src_stride_frame,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height, ARGBToI420Row_C);
-}
-
-int BGRAToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return RGBToI420(src_frame, src_stride_frame,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height, BGRAToI420Row_C);
-}
-
-int ABGRToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return RGBToI420(src_frame, src_stride_frame,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height, ABGRToI420Row_C);
-}
-
-int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return RGBToI420(src_frame, src_stride_frame,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height, RGB24ToI420Row_C);
-}
-
-int RAWToI420(const uint8* src_frame, int src_stride_frame,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- return RGBToI420(src_frame, src_stride_frame,
- dst_y, dst_stride_y,
- dst_u, dst_stride_u,
- dst_v, dst_stride_v,
- width, height, RAWToI420Row_C);
-}
-
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
@@ -830,9 +651,9 @@
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ARGBTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 8 == 0) &&
+ (width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_y, 8) && (dst_stride_y % 8 == 0)) {
+ IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
ARGBToYRow = ARGBToYRow_SSSE3;
} else
#endif
@@ -841,10 +662,10 @@
}
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 8 == 0) &&
+ (width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
- IS_ALIGNED(dst_u, 4) && (dst_stride_u % 4 == 0) &&
- IS_ALIGNED(dst_v, 4) && (dst_stride_v % 4 == 0)) {
+ IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+ IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
} else
#endif
@@ -853,17 +674,229 @@
}
for (int y = 0; y < (height - 1); y += 2) {
+ ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
ARGBToYRow(src_frame, dst_y, width);
ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
- ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
src_frame += src_stride_frame * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
- ARGBToYRow(src_frame, dst_y, width);
ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_frame, dst_y, width);
+ }
+ return 0;
+}
+
+int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (height < 0) {
+ height = -height;
+ src_frame = src_frame + (height - 1) * src_stride_frame;
+ src_stride_frame = -src_stride_frame;
+ }
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+#if defined(HAS_BGRATOYROW_SSSE3)
+ if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+ (width % 16 == 0) &&
+ IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+ IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+ ARGBToYRow = BGRAToYRow_SSSE3;
+ } else
+#endif
+ {
+ ARGBToYRow = BGRAToYRow_C;
+ }
+#if defined(HAS_BGRATOUVROW_SSSE3)
+ if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+ (width % 16 == 0) &&
+ IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+ IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+ IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+ ARGBToUVRow = BGRAToUVRow_SSSE3;
+ } else
+#endif
+ {
+ ARGBToUVRow = BGRAToUVRow_C;
+ }
+
+ for (int y = 0; y < (height - 1); y += 2) {
+ ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
+ ARGBToYRow(src_frame, dst_y, width);
+ ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
+ src_frame += src_stride_frame * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_frame, dst_y, width);
+ }
+ return 0;
+}
+
+int ABGRToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (height < 0) {
+ height = -height;
+ src_frame = src_frame + (height - 1) * src_stride_frame;
+ src_stride_frame = -src_stride_frame;
+ }
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+#if defined(HAS_ABGRTOYROW_SSSE3)
+ if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+ (width % 16 == 0) &&
+ IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+ IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+ ARGBToYRow = ABGRToYRow_SSSE3;
+ } else
+#endif
+ {
+ ARGBToYRow = ABGRToYRow_C;
+ }
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+ if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+ (width % 16 == 0) &&
+ IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+ IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+ IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+ ARGBToUVRow = ABGRToUVRow_SSSE3;
+ } else
+#endif
+ {
+ ARGBToUVRow = ABGRToUVRow_C;
+ }
+
+ for (int y = 0; y < (height - 1); y += 2) {
+ ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
+ ARGBToYRow(src_frame, dst_y, width);
+ ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
+ src_frame += src_stride_frame * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_frame, dst_y, width);
+ }
+ return 0;
+}
+
+int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (height < 0) {
+ height = -height;
+ src_frame = src_frame + (height - 1) * src_stride_frame;
+ src_stride_frame = -src_stride_frame;
+ }
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+#if defined(HAS_RGB24TOYROW_SSSE3)
+ if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+ (width % 16 == 0) &&
+ IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+ IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+ ARGBToYRow = RGB24ToYRow_SSSE3;
+ } else
+#endif
+ {
+ ARGBToYRow = RGB24ToYRow_C;
+ }
+#if defined(HAS_RGB24TOUVROW_SSSE3)
+ if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+ (width % 16 == 0) &&
+ IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+ IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+ IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+ ARGBToUVRow = RGB24ToUVRow_SSSE3;
+ } else
+#endif
+ {
+ ARGBToUVRow = RGB24ToUVRow_C;
+ }
+
+ for (int y = 0; y < (height - 1); y += 2) {
+ ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
+ ARGBToYRow(src_frame, dst_y, width);
+ ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
+ src_frame += src_stride_frame * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_frame, dst_y, width);
+ }
+ return 0;
+}
+
+int RAWToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (height < 0) {
+ height = -height;
+ src_frame = src_frame + (height - 1) * src_stride_frame;
+ src_stride_frame = -src_stride_frame;
+ }
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+#if defined(HAS_RAWTOYROW_SSSE3)
+ if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+ (width % 16 == 0) &&
+ IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+ IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
+ ARGBToYRow = RAWToYRow_SSSE3;
+ } else
+#endif
+ {
+ ARGBToYRow = RAWToYRow_C;
+ }
+#if defined(HAS_RAWTOUVROW_SSSE3)
+ if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+ (width % 16 == 0) &&
+ IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
+ IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+ IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+ ARGBToUVRow = RAWToUVRow_SSSE3;
+ } else
+#endif
+ {
+ ARGBToUVRow = RAWToUVRow_C;
+ }
+
+ for (int y = 0; y < (height - 1); y += 2) {
+ ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
+ ARGBToYRow(src_frame, dst_y, width);
+ ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
+ src_frame += src_stride_frame * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_frame, dst_y, width);
}
return 0;
}
diff --git a/source/cpu_id.cc b/source/cpu_id.cc
index fc388ba..cc44e21 100644
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -15,9 +15,6 @@
#include <intrin.h>
#endif
-// Internal flag to indicate cpuid is initialized.
-static const int kCpuInitialized = 16;
-
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static inline void __cpuid(int cpu_info[4], int info_type) {
@@ -64,11 +61,11 @@
void MaskCpuFlags(int enable_flags) {
InitCpuFlags();
- cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
+ cpu_info_ &= enable_flags;
}
bool TestCpuFlag(int flag) {
- if (!cpu_info_) {
+ if (0 == cpu_info_) {
InitCpuFlags();
}
return cpu_info_ & flag ? true : false;
diff --git a/source/format_conversion.cc b/source/format_conversion.cc
index db106bd..958f44c 100644
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -14,6 +14,8 @@
#include "video_common.h"
#include "row.h"
+#define kMaxStride (2048 * 4)
+
namespace libyuv {
// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
@@ -329,6 +331,9 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
+ if (width * 4 > kMaxStride) {
+ return -1;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -347,23 +352,29 @@
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
-#define kMaxStride (2048 * 4)
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+
#if defined(HAS_ARGBTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 8 == 0) &&
+ (width % 16 == 0) &&
IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
- IS_ALIGNED(dst_y, 8) && (dst_stride_y % 8 == 0)) {
+ IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
ARGBToYRow = ARGBToYRow_SSSE3;
-#if defined(HAS_ARGBTOUVROW_SSSE3)
- ARGBToUVRow = ARGBToUVRow_SSSE3;
-#else
- ARGBToUVRow = ARGBToUVRow_C;
-#endif
} else
#endif
{
ARGBToYRow = ARGBToYRow_C;
+ }
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+ (width % 16 == 0) &&
+ IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
+ IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+ IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ } else
+#endif
+ {
ARGBToUVRow = ARGBToUVRow_C;
}
@@ -392,9 +403,9 @@
BayerRow0(src_bayer, src_stride_bayer, row, width);
BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
ARGBToYRow(row, dst_y, width);
ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
- ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
src_bayer += src_stride_bayer * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
@@ -403,8 +414,8 @@
// TODO(fbarchard): Make sure this filters properly
if (height & 1) {
BayerRow0(src_bayer, src_stride_bayer, row, width);
- ARGBToYRow(row, dst_y, width);
ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
}
return 0;
}
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index b7984c0..a7e3e38 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -58,16 +58,6 @@
3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
-// Shuffle table for converting BG24 to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
- 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
-
-// Shuffle table for converting RAW to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
- 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
-
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITUV_SSE2
__declspec(naked)
@@ -206,7 +196,7 @@
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
__asm__ volatile
(
- "vdup.u32 {q0}, %2 \n" // duplicate 4 ints
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
"1:\n"
"vst1.u32 {q0}, [%0]! \n" // store
"subs %1, %1, #16 \n" // 16 processed per loop
@@ -1282,85 +1272,6 @@
}
}
-#define HAS_BG24TOARGBROW_SSSE3
-__declspec(naked)
-static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
- int pix) {
-__asm {
- mov eax, [esp + 4] // src_bg24
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0xff000000
- pslld xmm7, 24
- movdqa xmm6, _kShuffleMaskBG24ToARGB
-
- convertloop :
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm3, [eax + 32]
- lea eax, [eax + 48]
- movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
- pshufb xmm2, xmm6
- por xmm2, xmm7
- palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
- pshufb xmm0, xmm6
- movdqa [edx + 32], xmm2
- por xmm0, xmm7
- pshufb xmm1, xmm6
- movdqa [edx], xmm0
- por xmm1, xmm7
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
- pshufb xmm3, xmm6
- movdqa [edx + 16], xmm1
- por xmm3, xmm7
- movdqa [edx + 48], xmm3
- lea edx, [edx + 64]
- sub ecx, 16
- ja convertloop
- ret
- }
-}
-
-#define HAS_RAWTOARGBROW_SSSE3
-__declspec(naked)
-static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
- int pix) {
-__asm {
- mov eax, [esp + 4] // src_raw
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm7, xmm7 // generate mask 0xff000000
- pslld xmm7, 24
- movdqa xmm6, _kShuffleMaskRAWToARGB
-
- convertloop :
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm3, [eax + 32]
- lea eax, [eax + 48]
- movdqa xmm2, xmm3
- palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
- pshufb xmm2, xmm6
- por xmm2, xmm7
- palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
- pshufb xmm0, xmm6
- movdqa [edx + 32], xmm2
- por xmm0, xmm7
- pshufb xmm1, xmm6
- movdqa [edx], xmm0
- por xmm1, xmm7
- palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
- pshufb xmm3, xmm6
- movdqa [edx + 16], xmm1
- por xmm3, xmm7
- movdqa [edx + 48], xmm3
- lea edx, [edx + 64]
- sub ecx, 16
- ja convertloop
- ret
- }
-}
#elif (defined(__x86_64__) || defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
@@ -1435,84 +1346,6 @@
);
}
-#define HAS_BG24TOARGBROW_SSSE3
-static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
- int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
- "pslld $0x18,%%xmm7\n"
- "movdqa (%3),%%xmm6\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa 0x20(%0),%%xmm3\n"
- "lea 0x30(%0),%0\n"
- "movdqa %%xmm3,%%xmm2\n"
- "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
- "pshufb %%xmm6,%%xmm2\n"
- "por %%xmm7,%%xmm2\n"
- "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
- "pshufb %%xmm6,%%xmm0\n"
- "movdqa %%xmm2,0x20(%1)\n"
- "por %%xmm7,%%xmm0\n"
- "pshufb %%xmm6,%%xmm1\n"
- "movdqa %%xmm0,(%1)\n"
- "por %%xmm7,%%xmm1\n"
- "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
- "pshufb %%xmm6,%%xmm3\n"
- "movdqa %%xmm1,0x10(%1)\n"
- "por %%xmm7,%%xmm3\n"
- "movdqa %%xmm3,0x30(%1)\n"
- "lea 0x40(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- : "+r"(src_bg24), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "r"(kShuffleMaskBG24ToARGB) // %3
- : "memory"
-);
-}
-
-#define HAS_RAWTOARGBROW_SSSE3
-static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
- int pix) {
- asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
- "pslld $0x18,%%xmm7\n"
- "movdqa (%3),%%xmm6\n"
-"1:"
- "movdqa (%0),%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "movdqa 0x20(%0),%%xmm3\n"
- "lea 0x30(%0),%0\n"
- "movdqa %%xmm3,%%xmm2\n"
- "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
- "pshufb %%xmm6,%%xmm2\n"
- "por %%xmm7,%%xmm2\n"
- "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
- "pshufb %%xmm6,%%xmm0\n"
- "movdqa %%xmm2,0x20(%1)\n"
- "por %%xmm7,%%xmm0\n"
- "pshufb %%xmm6,%%xmm1\n"
- "movdqa %%xmm0,(%1)\n"
- "por %%xmm7,%%xmm1\n"
- "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
- "pshufb %%xmm6,%%xmm3\n"
- "movdqa %%xmm1,0x10(%1)\n"
- "por %%xmm7,%%xmm3\n"
- "movdqa %%xmm3,0x30(%1)\n"
- "lea 0x40(%1),%1\n"
- "sub $0x10,%2\n"
- "ja 1b\n"
- : "+r"(src_raw), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "r"(kShuffleMaskRAWToARGB) // %3
- : "memory"
-);
-}
-
#endif
static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
@@ -1556,97 +1389,6 @@
return 0;
}
-
-static void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
- for (int x = 0; x < pix; ++x) {
- uint8 r = src_raw[0];
- uint8 g = src_raw[1];
- uint8 b = src_raw[2];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = 255u;
- dst_argb += 4;
- src_raw += 3;
- }
-}
-
-// Convert RAW to ARGB.
-int RAWToARGB(const uint8* src_raw, int src_stride_raw,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- if (height < 0) {
- height = -height;
- src_raw = src_raw + (height - 1) * src_stride_raw;
- src_stride_raw = -src_stride_raw;
- }
- void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
-#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
- IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
- RAWToARGBRow = RAWToARGBRow_SSSE3;
- } else
-#endif
- {
- RAWToARGBRow = RAWToARGBRow_C;
- }
-
- for (int y = 0; y < height; ++y) {
- RAWToARGBRow(src_raw, dst_argb, width);
- src_raw += src_stride_raw;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-static void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
- for (int x = 0; x < pix; ++x) {
- uint8 b = src_bg24[0];
- uint8 g = src_bg24[1];
- uint8 r = src_bg24[2];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = 255u;
- dst_argb[3] = 255u;
- dst_argb += 4;
- src_bg24 += 3;
- }
-}
-
-// Convert BG24 to ARGB.
-int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
- if (height < 0) {
- height = -height;
- src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
- src_stride_bg24 = -src_stride_bg24;
- }
- void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
-#if defined(HAS_BG24TOARGBROW_SSSE3)
- if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
- (width % 16 == 0) &&
- IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
- IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
- BG24ToARGBRow = BG24ToARGBRow_SSSE3;
- } else
-#endif
- {
- BG24ToARGBRow = BG24ToARGBRow_C;
- }
-
- for (int y = 0; y < height; ++y) {
- BG24ToARGBRow(src_bg24, dst_argb, width);
- src_bg24 += src_stride_bg24;
- dst_argb += dst_stride_argb;
- }
- return 0;
-}
-
-
static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
// To support in-place conversion.
@@ -1768,5 +1510,66 @@
return 0;
}
+
+// Convert RAW to ARGB.
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+ (width % 16 == 0) &&
+ IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
+ IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ } else
+#endif
+ {
+ RAWToARGBRow = RAWToARGBRow_C;
+ }
+
+ for (int y = 0; y < height; ++y) {
+ RAWToARGBRow(src_raw, dst_argb, width);
+ src_raw += src_stride_raw;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert BG24 to ARGB.
+int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (height < 0) {
+ height = -height;
+ src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
+ src_stride_bg24 = -src_stride_bg24;
+ }
+ void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
+#if defined(HAS_BG24TOARGBROW_SSSE3)
+ if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+ (width % 16 == 0) &&
+ IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
+ IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+ BG24ToARGBRow = BG24ToARGBRow_SSSE3;
+ } else
+#endif
+ {
+ BG24ToARGBRow = BG24ToARGBRow_C;
+ }
+
+ for (int y = 0; y < height; ++y) {
+ BG24ToARGBRow(src_bg24, dst_argb, width);
+ src_bg24 += src_stride_bg24;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
} // namespace libyuv
diff --git a/source/rotate.cc b/source/rotate.cc
index a1b05e8..43a0072 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -497,6 +497,143 @@
);
#if defined (__x86_64__)
+// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
+#define HAS_TRANSPOSE_WX8_FAST_SSSE3
+static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ asm volatile(
+"1:"
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ "movdqa (%0),%%xmm0\n"
+ "movdqa (%0,%3),%%xmm1\n"
+ "lea (%0,%3,2),%0\n"
+ "movdqa %%xmm0,%%xmm8\n"
+ "punpcklbw %%xmm1,%%xmm0\n"
+ "punpckhbw %%xmm1,%%xmm8\n"
+ "movdqa (%0),%%xmm2\n"
+ "movdqa %%xmm0,%%xmm1\n"
+ "movdqa %%xmm8,%%xmm9\n"
+ "palignr $0x8,%%xmm1,%%xmm1\n"
+ "palignr $0x8,%%xmm9,%%xmm9\n"
+ "movdqa (%0,%3),%%xmm3\n"
+ "lea (%0,%3,2),%0\n"
+ "movdqa %%xmm2,%%xmm10\n"
+ "punpcklbw %%xmm3,%%xmm2\n"
+ "punpckhbw %%xmm3,%%xmm10\n"
+ "movdqa %%xmm2,%%xmm3\n"
+ "movdqa %%xmm10,%%xmm11\n"
+ "movdqa (%0),%%xmm4\n"
+ "palignr $0x8,%%xmm3,%%xmm3\n"
+ "palignr $0x8,%%xmm11,%%xmm11\n"
+ "movdqa (%0,%3),%%xmm5\n"
+ "lea (%0,%3,2),%0\n"
+ "movdqa %%xmm4,%%xmm12\n"
+ "punpcklbw %%xmm5,%%xmm4\n"
+ "punpckhbw %%xmm5,%%xmm12\n"
+ "movdqa %%xmm4,%%xmm5\n"
+ "movdqa %%xmm12,%%xmm13\n"
+ "movdqa (%0),%%xmm6\n"
+ "palignr $0x8,%%xmm5,%%xmm5\n"
+ "palignr $0x8,%%xmm13,%%xmm13\n"
+ "movdqa (%0,%3),%%xmm7\n"
+ "lea (%0,%3,2),%0\n"
+ "movdqa %%xmm6,%%xmm14\n"
+ "punpcklbw %%xmm7,%%xmm6\n"
+ "punpckhbw %%xmm7,%%xmm14\n"
+ "neg %3\n"
+ "movdqa %%xmm6,%%xmm7\n"
+ "movdqa %%xmm14,%%xmm15\n"
+ "lea 0x10(%0,%3,8),%0\n"
+ "palignr $0x8,%%xmm7,%%xmm7\n"
+ "palignr $0x8,%%xmm15,%%xmm15\n"
+ "neg %3\n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0\n"
+ "punpcklwd %%xmm3,%%xmm1\n"
+ "movdqa %%xmm0,%%xmm2\n"
+ "movdqa %%xmm1,%%xmm3\n"
+ "palignr $0x8,%%xmm2,%%xmm2\n"
+ "palignr $0x8,%%xmm3,%%xmm3\n"
+ "punpcklwd %%xmm6,%%xmm4\n"
+ "punpcklwd %%xmm7,%%xmm5\n"
+ "movdqa %%xmm4,%%xmm6\n"
+ "movdqa %%xmm5,%%xmm7\n"
+ "palignr $0x8,%%xmm6,%%xmm6\n"
+ "palignr $0x8,%%xmm7,%%xmm7\n"
+ "punpcklwd %%xmm10,%%xmm8\n"
+ "punpcklwd %%xmm11,%%xmm9\n"
+ "movdqa %%xmm8,%%xmm10\n"
+ "movdqa %%xmm9,%%xmm11\n"
+ "palignr $0x8,%%xmm10,%%xmm10\n"
+ "palignr $0x8,%%xmm11,%%xmm11\n"
+ "punpcklwd %%xmm14,%%xmm12\n"
+ "punpcklwd %%xmm15,%%xmm13\n"
+ "movdqa %%xmm12,%%xmm14\n"
+ "movdqa %%xmm13,%%xmm15\n"
+ "palignr $0x8,%%xmm14,%%xmm14\n"
+ "palignr $0x8,%%xmm15,%%xmm15\n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0\n"
+ "movq %%xmm0,(%1)\n"
+ "movdqa %%xmm0,%%xmm4\n"
+ "palignr $0x8,%%xmm4,%%xmm4\n"
+ "movq %%xmm4,(%1,%4)\n"
+ "lea (%1,%4,2),%1\n"
+ "punpckldq %%xmm6,%%xmm2\n"
+ "movdqa %%xmm2,%%xmm6\n"
+ "movq %%xmm2,(%1)\n"
+ "palignr $0x8,%%xmm6,%%xmm6\n"
+ "punpckldq %%xmm5,%%xmm1\n"
+ "movq %%xmm6,(%1,%4)\n"
+ "lea (%1,%4,2),%1\n"
+ "movdqa %%xmm1,%%xmm5\n"
+ "movq %%xmm1,(%1)\n"
+ "palignr $0x8,%%xmm5,%%xmm5\n"
+ "movq %%xmm5,(%1,%4)\n"
+ "lea (%1,%4,2),%1\n"
+ "punpckldq %%xmm7,%%xmm3\n"
+ "movq %%xmm3,(%1)\n"
+ "movdqa %%xmm3,%%xmm7\n"
+ "palignr $0x8,%%xmm7,%%xmm7\n"
+ "movq %%xmm7,(%1,%4)\n"
+ "lea (%1,%4,2),%1\n"
+ "punpckldq %%xmm12,%%xmm8\n"
+ "movq %%xmm8,(%1)\n"
+ "movdqa %%xmm8,%%xmm12\n"
+ "palignr $0x8,%%xmm12,%%xmm12\n"
+ "movq %%xmm12,(%1,%4)\n"
+ "lea (%1,%4,2),%1\n"
+ "punpckldq %%xmm14,%%xmm10\n"
+ "movdqa %%xmm10,%%xmm14\n"
+ "movq %%xmm10,(%1)\n"
+ "palignr $0x8,%%xmm14,%%xmm14\n"
+ "punpckldq %%xmm13,%%xmm9\n"
+ "movq %%xmm14,(%1,%4)\n"
+ "lea (%1,%4,2),%1\n"
+ "movdqa %%xmm9,%%xmm13\n"
+ "movq %%xmm9,(%1)\n"
+ "palignr $0x8,%%xmm13,%%xmm13\n"
+ "movq %%xmm13,(%1,%4)\n"
+ "lea (%1,%4,2),%1\n"
+ "punpckldq %%xmm15,%%xmm11\n"
+ "movq %%xmm11,(%1)\n"
+ "movdqa %%xmm11,%%xmm15\n"
+ "palignr $0x8,%%xmm15,%%xmm15\n"
+ "movq %%xmm15,(%1,%4)\n"
+ "lea (%1,%4,2),%1\n"
+ "sub $0x10,%2\n"
+ "ja 1b\n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)), // %3
+ "r"(static_cast<intptr_t>(dst_stride)) // %4
+ : "memory"
+);
+}
+
#define HAS_TRANSPOSE_UVWX8_SSE2
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
@@ -644,17 +781,26 @@
#if defined(HAS_TRANSPOSE_WX8_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
(width % 8 == 0) &&
- IS_ALIGNED(src, 16) && (src_stride % 8 == 0) &&
- IS_ALIGNED(dst, 16) && (dst_stride % 8 == 0)) {
+ IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
+ IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
TransposeWx8 = TransposeWx8_NEON;
TransposeWxH = TransposeWxH_C;
} else
#endif
+#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
+ if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+ (width % 16 == 0) &&
+ IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
+ IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
+ TransposeWx8 = TransposeWx8_FAST_SSSE3;
+ TransposeWxH = TransposeWxH_C;
+ } else
+#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
(width % 8 == 0) &&
- IS_ALIGNED(src, 16) && (src_stride % 8 == 0) &&
- IS_ALIGNED(dst, 16) && (dst_stride % 8 == 0)) {
+ IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
+ IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
TransposeWx8 = TransposeWx8_SSSE3;
TransposeWxH = TransposeWxH_C;
} else
diff --git a/source/row.h b/source/row.h
index 1563e95..85343c5 100644
--- a/source/row.h
+++ b/source/row.h
@@ -13,17 +13,91 @@
#include "libyuv/basic_types.h"
+// The following are available on all x86 platforms
#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
&& !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_ARGBTOYROW_SSSE3
+#define HAS_BG24TOARGBROW_SSSE3
+#define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RGB24TOYROW_SSSE3
+#define HAS_RAWTOYROW_SSSE3
+#define HAS_RGB24TOUVROW_SSSE3
+#define HAS_RAWTOUVROW_SSSE3
#endif
+// The following are available only on Windows
#if defined(WIN32) \
&& !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+#define HAS_BGRATOYROW_SSSE3
+#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_ABGRTOUVROW_SSSE3
#endif
extern "C" {
+#ifdef HAS_ARGBTOYROW_SSSE3
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+#endif
+#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
+#define HASRGB24TOYROW_SSSE3
+#endif
+#ifdef HASRGB24TOYROW_SSSE3
+void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+#endif
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+#ifdef HAS_BG24TOARGBROW_SSSE3
+void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
+#endif
+void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
+void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
+
+#if defined(_MSC_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
+#else
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#define TALIGN16(t, var) t var __attribute__((aligned(16)))
+#endif
+
+#ifdef OSX
+extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
+extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
+extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
+#else
+extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
+extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]);
+extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]);
+#endif
void FastConvertYUVToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -52,34 +126,6 @@
uint8* rgb_buf,
int width);
-#ifdef HAS_ARGBTOYROW_SSSE3
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-#endif
-void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-
-
-#if defined(_MSC_VER)
-#define SIMD_ALIGNED(var) __declspec(align(16)) var
-#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
-#else
-#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
-#endif
-
-#ifdef OSX
-extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
-extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
-extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
-#else
-extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
-extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]);
-extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]);
-#endif
-
// Method to force C version.
//#define USE_MMX 0
//#define USE_SSE2 0
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 40e636c..88ce475 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -23,6 +23,16 @@
1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
};
+// Shuffle table for converting BG24 to ARGB.
+extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB.
+extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
+ 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile(
"movdqa (%3),%%xmm7\n"
@@ -55,47 +65,81 @@
}
#endif
-static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
- return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
+#ifdef HAS_BG24TOARGBROW_SSSE3
+void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
+ asm volatile(
+ "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm7\n"
+ "movdqa (%3),%%xmm6\n"
+"1:"
+ "movdqa (%0),%%xmm0\n"
+ "movdqa 0x10(%0),%%xmm1\n"
+ "movdqa 0x20(%0),%%xmm3\n"
+ "lea 0x30(%0),%0\n"
+ "movdqa %%xmm3,%%xmm2\n"
+ "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
+ "pshufb %%xmm6,%%xmm2\n"
+ "por %%xmm7,%%xmm2\n"
+ "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
+ "pshufb %%xmm6,%%xmm0\n"
+ "movdqa %%xmm2,0x20(%1)\n"
+ "por %%xmm7,%%xmm0\n"
+ "pshufb %%xmm6,%%xmm1\n"
+ "movdqa %%xmm0,(%1)\n"
+ "por %%xmm7,%%xmm1\n"
+ "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
+ "pshufb %%xmm6,%%xmm3\n"
+ "movdqa %%xmm1,0x10(%1)\n"
+ "por %%xmm7,%%xmm3\n"
+ "movdqa %%xmm3,0x30(%1)\n"
+ "lea 0x40(%1),%1\n"
+ "sub $0x10,%2\n"
+ "ja 1b\n"
+ : "+r"(src_bg24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(kShuffleMaskBG24ToARGB) // %3
+ : "memory"
+);
}
-static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
- return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
+ asm volatile(
+ "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm7\n"
+ "movdqa (%3),%%xmm6\n"
+"1:"
+ "movdqa (%0),%%xmm0\n"
+ "movdqa 0x10(%0),%%xmm1\n"
+ "movdqa 0x20(%0),%%xmm3\n"
+ "lea 0x30(%0),%0\n"
+ "movdqa %%xmm3,%%xmm2\n"
+ "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
+ "pshufb %%xmm6,%%xmm2\n"
+ "por %%xmm7,%%xmm2\n"
+ "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
+ "pshufb %%xmm6,%%xmm0\n"
+ "movdqa %%xmm2,0x20(%1)\n"
+ "por %%xmm7,%%xmm0\n"
+ "pshufb %%xmm6,%%xmm1\n"
+ "movdqa %%xmm0,(%1)\n"
+ "por %%xmm7,%%xmm1\n"
+ "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
+ "pshufb %%xmm6,%%xmm3\n"
+ "movdqa %%xmm1,0x10(%1)\n"
+ "por %%xmm7,%%xmm3\n"
+ "movdqa %%xmm3,0x30(%1)\n"
+ "lea 0x40(%1),%1\n"
+ "sub $0x10,%2\n"
+ "ja 1b\n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(kShuffleMaskRAWToARGB) // %3
+ : "memory"
+);
}
-static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
- return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
-}
-
-void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
- for (int x = 0; x < width; ++x) {
- dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
- src_argb0 += 4;
- dst_y += 1;
- }
-}
-
-void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- const uint8* src_argb1 = src_argb0 + src_stride_argb;
- for (int x = 0; x < width - 1; x += 2) {
- uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
- uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
- uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
- src_argb0 += 8;
- src_argb1 += 8;
- dst_u += 1;
- dst_v += 1;
- }
- if (width & 1) {
- uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1;
- uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1;
- uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1;
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
- }
-}
+#endif
#if defined(__x86_64__)
@@ -611,4 +655,5 @@
}
#endif
+
} // extern "C"
diff --git a/source/row_table.cc b/source/row_table.cc
index 7ce4a7e..022d9f8 100644
--- a/source/row_table.cc
+++ b/source/row_table.cc
@@ -10,6 +10,8 @@
#include "row.h"
+#define kMaxStride (2048 * 4)
+
extern "C" {
#define MAKETABLE(NAME) \
@@ -301,4 +303,167 @@
MAKETABLE(_kCoefficientsAbgrY)
#endif
+
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
+ for (int x = 0; x < pix; ++x) {
+ uint8 r = src_raw[0];
+ uint8 g = src_raw[1];
+ uint8 b = src_raw[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_raw += 3;
+ }
+}
+
+void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
+ for (int x = 0; x < pix; ++x) {
+ uint8 b = src_bg24[0];
+ uint8 g = src_bg24[1];
+ uint8 r = src_bg24[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_bg24 += 3;
+ }
+}
+
+// C versions: convert the row(s) to ARGB first, then reuse the ARGB row functions.
+void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ BG24ToARGBRow_C(src_argb, row, pix);
+ ARGBToYRow_C(row, dst_y, pix);
+}
+
+void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ RAWToARGBRow_C(src_argb, row, pix);
+ ARGBToYRow_C(row, dst_y, pix);
+}
+
+void RGB24ToUVRow_C(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ BG24ToARGBRow_C(src_argb, row, pix);
+ BG24ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
+ ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
+}
+
+void RAWToUVRow_C(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ RAWToARGBRow_C(src_argb, row, pix);
+ RAWToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
+ ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
+}
+
+static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
+ return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
+}
+
+static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+ return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
+}
+static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+ return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
+}
+
+#define MAKEROWY(NAME,R,G,B) \
+void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+ for (int x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += 4; \
+ dst_y += 1; \
+ } \
+} \
+void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ for (int x = 0; x < width - 1; x += 2) { \
+ uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] + \
+ src_rgb1[B] + src_rgb1[B + 4]) >> 2; \
+ uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] + \
+ src_rgb1[G] + src_rgb1[G + 4]) >> 2; \
+ uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] + \
+ src_rgb1[R] + src_rgb1[R + 4]) >> 2; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ src_rgb0 += 8; \
+ src_rgb1 += 8; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
+ uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
+ uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ } \
+}
+
+MAKEROWY(ARGB,2,1,0)
+MAKEROWY(BGRA,1,2,3)
+MAKEROWY(ABGR,0,1,2)
+
+#if defined(HAS_RAWTOYROW_SSSE3)
+
+void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ BG24ToARGBRow_SSSE3(src_argb, row, pix);
+ ARGBToYRow_SSSE3(row, dst_y, pix);
+}
+
+void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ RAWToARGBRow_SSSE3(src_argb, row, pix);
+ ARGBToYRow_SSSE3(row, dst_y, pix);
+}
+
+#endif
+
+#if defined(HAS_RAWTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ BG24ToARGBRow_SSSE3(src_argb, row, pix);
+ BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
+ ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
+}
+
+void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ RAWToARGBRow_SSSE3(src_argb, row, pix);
+ RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
+ ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
+}
+
+#else
+
+void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ BG24ToARGBRow_SSSE3(src_argb, row, pix);
+ BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
+ ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
+}
+
+void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ RAWToARGBRow_SSSE3(src_argb, row, pix);
+ RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
+ ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
+}
+
+#endif
+#endif
+
} // extern "C"
diff --git a/source/row_win.cc b/source/row_win.cc
index c90372a..2bc5fb1 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -16,59 +16,160 @@
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
// Constant multiplication table for converting ARGB to I400.
-extern "C" TALIGN16(const int8, kRGBToY[16]) = {
+extern "C" TALIGN16(const int8, kARGBToY[16]) = {
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
-extern "C" TALIGN16(const int8, kRGBToU[16]) = {
+extern "C" TALIGN16(const int8, kARGBToU[16]) = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
-extern "C" TALIGN16(const int8, kRGBToV[16]) = {
+extern "C" TALIGN16(const int8, kARGBToV[16]) = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
+// Constants for BGRA
+extern "C" TALIGN16(const int8, kBGRAToY[16]) = {
+ 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+extern "C" TALIGN16(const int8, kBGRAToU[16]) = {
+ 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+extern "C" TALIGN16(const int8, kBGRAToV[16]) = {
+ 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR
+extern "C" TALIGN16(const int8, kABGRToY[16]) = {
+ 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+extern "C" TALIGN16(const int8, kABGRToU[16]) = {
+ -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+extern "C" TALIGN16(const int8, kABGRToV[16]) = {
+ 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
extern "C" TALIGN16(const uint8, kAddY16[16]) = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
};
extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
- 128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u,
- 128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
+// Shuffle table for converting BG24 to ARGB.
+extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB.
+extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
+ 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
- movdqa xmm7, _kRGBToY
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm7, _kARGBToY
movdqa xmm6, _kAddY16
- pcmpeqb xmm5, xmm5 // Generate mask 0x0000ffff
- psrld xmm5, 16
convertloop :
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- pmaddubsw xmm0, xmm7
- lea eax, [eax + 32]
- pmaddubsw xmm1, xmm7 // BG ra BG ra BG ra BG ra
- palignr xmm2, xmm0, 2 // AR xx AR xx AR xx AR xx
- paddw xmm2, xmm0 // BGRA xx BGRA xx BGRA xx BGRA xx
- pand xmm2, xmm5 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
- palignr xmm3, xmm1, 2
- paddw xmm3, xmm1
- pand xmm3, xmm5 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
- packssdw xmm2, xmm3 // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA
- psrlw xmm2, 7 // 0B xx 0B xx 0B xx 0B xx
- packuswb xmm2, xmm2
- paddb xmm2, xmm6
- movq qword ptr [edx], xmm2
- lea edx, [edx + 8]
- sub ecx, 8
- ja convertloop
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm1, xmm7
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm3, xmm7
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm6
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ ja convertloop
+ ret
+ }
+}
+
+__declspec(naked)
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm7, _kBGRAToY
+ movdqa xmm6, _kAddY16
+
+ convertloop :
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm1, xmm7
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm3, xmm7
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm6
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ ja convertloop
+ ret
+ }
+}
+
+__declspec(naked)
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm7, _kABGRToY
+ movdqa xmm6, _kAddY16
+
+ convertloop :
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm1, xmm7
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm3, xmm7
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm6
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ ja convertloop
ret
}
}
@@ -84,55 +185,52 @@
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, _kRGBToU
- movdqa xmm6, _kRGBToV
+ movdqa xmm7, _kARGBToU
+ movdqa xmm6, _kARGBToV
movdqa xmm5, _kAddUV128
- pcmpeqb xmm4, xmm4 // Generate mask 0x0000ffff
- psrld xmm4, 16
+ sub edi, edx // stride from u to v
convertloop :
- // step 1 - subsample 8x2 argb pixels to 4x1
- movdqa xmm0, [eax] // 32x2 -> 32x1
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // 32x1 -> 16x1
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi]
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
- shufps xmm2, xmm1, 0xdd
- pavgb xmm0, xmm2
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
- // instead of 8 different pixels, its 4 pixels of U and 4 of V
+ // instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm1, xmm6 // V
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
- palignr xmm2, xmm0, 2 // AR xx AR xx AR xx AR xx
- paddw xmm2, xmm0 // BGRA xx BGRA xx BGRA xx BGRA xx
- pand xmm2, xmm4 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
-
- palignr xmm3, xmm1, 2
- paddw xmm3, xmm1
- pand xmm3, xmm4 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
-
- psraw xmm2, 8
- psraw xmm3, 8
- packsswb xmm2, xmm3 // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA
- paddb xmm2, xmm5 // -> unsigned
- packuswb xmm2, xmm2 // 8 bytes. 4 U, 4 V
-
- // step 3 - store 4 U and 4 V values
- movd dword ptr [edx], xmm2 // U
- lea edx, [edx + 4]
- pshufd xmm0, xmm2, 0x55 // V
- movd dword ptr [edi], xmm0
- lea edi, [edi + 4]
- sub ecx, 8
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
ja convertloop
pop edi
pop esi
@@ -140,45 +238,208 @@
}
}
-static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
- return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
-}
+__declspec(naked)
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, _kBGRAToU
+ movdqa xmm6, _kBGRAToV
+ movdqa xmm5, _kAddUV128
+ sub edi, edx // stride from u to v
-static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
- return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
-}
-static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
- return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
-}
+ convertloop :
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi]
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
-void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
- for (int x = 0; x < width; ++x) {
- dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
- src_argb0 += 4;
- dst_y += 1;
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ ja convertloop
+ pop edi
+ pop esi
+ ret
}
}
-void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- const uint8* src_argb1 = src_argb0 + src_stride_argb;
- for (int x = 0; x < width - 1; x += 2) {
- uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
- uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
- uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
- src_argb0 += 8;
- src_argb1 += 8;
- dst_u += 1;
- dst_v += 1;
+__declspec(naked)
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, _kABGRToU
+ movdqa xmm6, _kABGRToV
+ movdqa xmm5, _kAddUV128
+ sub edi, edx // stride from u to v
+
+ convertloop :
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi]
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ ja convertloop
+ pop edi
+ pop esi
+ ret
}
- if (width & 1) {
- uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1;
- uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1;
- uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1;
- dst_u[0] = RGBToU(ar, ag, ab);
- dst_v[0] = RGBToV(ar, ag, ab);
+}
+
+__declspec(naked)
+void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
+__asm {
+ mov eax, [esp + 4] // src_bg24
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm7, xmm7 // generate mask 0xff000000
+ pslld xmm7, 24
+ movdqa xmm6, _kShuffleMaskBG24ToARGB
+
+ convertloop :
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
+ pshufb xmm2, xmm6
+ por xmm2, xmm7
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
+ pshufb xmm0, xmm6
+ movdqa [edx + 32], xmm2
+ por xmm0, xmm7
+ pshufb xmm1, xmm6
+ movdqa [edx], xmm0
+ por xmm1, xmm7
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ pshufb xmm3, xmm6
+ movdqa [edx + 16], xmm1
+ por xmm3, xmm7
+ movdqa [edx + 48], xmm3
+ lea edx, [edx + 64]
+ sub ecx, 16
+ ja convertloop
+ ret
+ }
+}
+
+__declspec(naked)
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
+ int pix) {
+__asm {
+ mov eax, [esp + 4] // src_raw
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm7, xmm7 // generate mask 0xff000000
+ pslld xmm7, 24
+ movdqa xmm6, _kShuffleMaskRAWToARGB
+
+ convertloop :
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
+ pshufb xmm2, xmm6
+ por xmm2, xmm7
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
+ pshufb xmm0, xmm6
+ movdqa [edx + 32], xmm2
+ por xmm0, xmm7
+ pshufb xmm1, xmm6
+ movdqa [edx], xmm0
+ por xmm1, xmm7
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ pshufb xmm3, xmm6
+ movdqa [edx + 16], xmm1
+ por xmm3, xmm7
+ movdqa [edx + 48], xmm3
+ lea edx, [edx + 64]
+ sub ecx, 16
+ ja convertloop
+ ret
}
}