Add RAWToI444
Skylake Xeon
RAWToI444_Opt (433 ms)
RAWToJ444_Opt (1781 ms)
ARGBToI444_Opt (352 ms)
ARGBToJ444_Opt (1577 ms)
Samsung S22 Exynos
ARGBToI444_Opt (283 ms)
ARGBToJ444_Opt (209 ms)
RAWToI444_Opt (294 ms)
RAWToJ444_Opt (293 ms)
Profiling on Samsung S22 Exynos
37.62%, ARGBToUV444Row_NEON_I8MM
29.42%, RAWToARGBRow_SVE2
19.61%, ARGBToYRow_NEON_DotProd
Passing different --libyuv_cpu_info=N values lets us compare each ISA:
C 1 RAWToI444_Opt (781 ms)
NEON 511 RAWToI444_Opt (757 ms)
NEONDOT 1023 RAWToI444_Opt (571 ms)
NEONI8MM 2047 RAWToI444_Opt (334 ms)
SVE2 8191 RAWToI444_Opt (307 ms)
Bug: 390247964
Change-Id: I0316fedd32222588455afa751f5b854f46bce024
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6223658
Reviewed-by: Wan-Teh Chang <wtc@google.com>
diff --git a/README.chromium b/README.chromium
index a0416e6..b44f26f 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1902
+Version: 1903
License: BSD
License File: LICENSE
Shipped: yes
diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h
index 9fc6d34..750383a 100644
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -924,6 +924,19 @@
int width,
int height);
+// RGB big endian (rgb in memory) to I444. Returns 0 on success, -1 on invalid arguments, 1 if the temporary row cannot be allocated.
+LIBYUV_API
+int RAWToI444(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
// RGB big endian (rgb in memory) to J420.
LIBYUV_API
int RAWToJ420(const uint8_t* src_raw,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index d8fe213..9ee8af6 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -398,7 +398,6 @@
#define HAS_ARGBTOUVJ444ROW_NEON
#define HAS_ARGBTOUVJROW_NEON
#define HAS_ARGBTOUVROW_NEON
-// TODO: Fix ARGBTOYROW and test ARGBToI444 tests pass.
#define HAS_ARGBTOYJROW_NEON
#define HAS_ARGBTOYROW_NEON
#define HAS_AYUVTOUVROW_NEON
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index e0026d6..84f35c4 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1902
+#define LIBYUV_VERSION 1903
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert.cc b/source/convert.cc
index 6bceb6d..6c37143 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -3521,6 +3521,218 @@
}
#undef HAS_RAWTOYJROW
+// RAW big endian (rgb in memory) to I444; one Y, U and V row is written per source row.
+// 2 step conversion of RAWToARGB then ARGBToY and ARGBToUV444. Returns 0 on success, -1 on bad arguments, 1 on allocation failure.
+LIBYUV_API
+int RAWToI444(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;  // Row counter for the conversion loop.
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RAWToARGBRow_C;  // Each row function starts as the C version; CPU-specific ones may replace it below.
+ void (*ARGBToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+ void (*ARGBToUV444Row)(const uint8_t* src_raw, uint8_t* dst_u, uint8_t* dst_v,
+ int width) = ARGBToUV444Row_C;
+ if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {  // Reject null planes and empty images.
+ return -1;
+ }
+ if (height < 0) {  // Negative height: start at the last source row and step backwards.
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // TODO: add row coalesce when main loop handles large width in blocks
+ // TODO: implement UV444 or trim the ifdef below
+#if defined(HAS_ARGBTOUV444ROW_SSSE3)  // Select the ARGB -> U/V row function; later matches override earlier ones.
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {  // Width is a multiple of 16: use the non-"Any" variant.
+ ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToUV444Row = ARGBToUV444Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_NEON_I8MM)
+ if (TestCpuFlag(kCpuHasNeonI8MM)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_NEON_I8MM;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToUV444Row = ARGBToUV444Row_NEON_I8MM;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUV444Row = ARGBToUV444Row_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)  // Select the ARGB -> Y row function; later matches override earlier ones.
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON_DOTPROD)
+ if (TestCpuFlag(kCpuHasNeonDotProd)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON_DotProd;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_NEON_DotProd;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYRow = ARGBToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ ARGBToYRow = ARGBToYRow_RVV;  // Selected without a width alignment check.
+ }
+#endif
+
+#if defined(HAS_RAWTOARGBROW_SSSE3)  // Select the RAW -> ARGB row function; later matches override earlier ones.
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToARGBRow = RAWToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToARGBRow = RAWToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_SVE2)
+ if (TestCpuFlag(kCpuHasSVE2)) {
+ RAWToARGBRow = RAWToARGBRow_SVE2;  // Selected without a width alignment check.
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToARGBRow = RAWToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToARGBRow = RAWToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RAWToARGBRow = RAWToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToARGBRow = RAWToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_RVV)
+ if (TestCpuFlag(kCpuHasRVV)) {
+ RAWToARGBRow = RAWToARGBRow_RVV;  // Selected without a width alignment check.
+ }
+#endif
+
+ {
+ // Allocate one temporary row of ARGB (4 bytes per pixel), reused for every source row.
+ const int row_size = width * 4;
+ align_buffer_64(row, row_size);
+ if (!row)
+ return 1;  // The temporary row could not be allocated.
+
+ for (y = 0; y < height; ++y) {
+ RAWToARGBRow(src_raw, row, width);  // Step 1: RAW -> ARGB into the temporary row.
+ ARGBToUV444Row(row, dst_u, dst_v, width);  // Step 2a: ARGB -> U and V rows.
+ ARGBToYRow(row, dst_y, width);  // Step 2b: ARGB -> Y row.
+ src_raw += src_stride_raw;  // Advance every plane by its own stride.
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ free_aligned_buffer_64(row);
+ }
+ return 0;
+}
+
// RAW big endian (rgb in memory) to J444
// 2 step conversion of RAWToARGB then ARGBToYJ and ARGBToUVJ444
LIBYUV_API
@@ -3714,7 +3926,7 @@
{
// Allocate a row of ARGB.
- const int row_size = (width * 4 + 31) & ~31;
+ const int row_size = width * 4;
align_buffer_64(row, row_size);
if (!row)
return 1;
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 498f66f..be36343 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -678,6 +678,7 @@
TESTATOPLANAR(I400, 1, 1, I420, 2, 2)
TESTATOPLANAR(J400, 1, 1, J420, 2, 2)
TESTATOPLANAR(RAW, 3, 1, I420, 2, 2)
+TESTATOPLANAR(RAW, 3, 1, I444, 1, 1)
TESTATOPLANAR(RAW, 3, 1, J420, 2, 2)
TESTATOPLANAR(RAW, 3, 1, J444, 1, 1)
TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2)