Add RAWToI444

Skylake Xeon
  RAWToI444_Opt (433 ms)
  RAWToJ444_Opt (1781 ms)
  ARGBToI444_Opt (352 ms)
  ARGBToJ444_Opt (1577 ms)

Samsung S22 Exynos
  ARGBToI444_Opt (283 ms)
  ARGBToJ444_Opt (209 ms)
  RAWToI444_Opt (294 ms)
  RAWToJ444_Opt (293 ms)

Profiling on Samsung S22 Exynos
37.62%,  ARGBToUV444Row_NEON_I8MM
29.42%,  RAWToARGBRow_SVE2
19.61%,  ARGBToYRow_NEON_DotProd

Passing different --libyuv_cpu_info=N etc we can compare each ISA
C           1  RAWToI444_Opt (781 ms)
NEON      511  RAWToI444_Opt (757 ms)
NEONDOT  1023  RAWToI444_Opt (571 ms)
NEONI8MM 2047  RAWToI444_Opt (334 ms)
SVE2     8191  RAWToI444_Opt (307 ms)



Bug: 390247964
Change-Id: I0316fedd32222588455afa751f5b854f46bce024
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6223658
Reviewed-by: Wan-Teh Chang <wtc@google.com>
diff --git a/README.chromium b/README.chromium
index a0416e6..b44f26f 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1902
+Version: 1903
 License: BSD
 License File: LICENSE
 Shipped: yes
diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h
index 9fc6d34..750383a 100644
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -924,6 +924,19 @@
               int width,
               int height);
 
+// RGB big endian (rgb in memory) to I444.
+LIBYUV_API
+int RAWToI444(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height);
+
 // RGB big endian (rgb in memory) to J420.
 LIBYUV_API
 int RAWToJ420(const uint8_t* src_raw,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index d8fe213..9ee8af6 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -398,7 +398,6 @@
 #define HAS_ARGBTOUVJ444ROW_NEON
 #define HAS_ARGBTOUVJROW_NEON
 #define HAS_ARGBTOUVROW_NEON
-// TODO: Fix ARGBTOYROW and test ARGBToI444 tests pass.
 #define HAS_ARGBTOYJROW_NEON
 #define HAS_ARGBTOYROW_NEON
 #define HAS_AYUVTOUVROW_NEON
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index e0026d6..84f35c4 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1902
+#define LIBYUV_VERSION 1903
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert.cc b/source/convert.cc
index 6bceb6d..6c37143 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -3521,6 +3521,218 @@
 }
 #undef HAS_RAWTOYJROW
 
+// RAW big endian (rgb in memory) to I444
+// 2 step conversion of RAWToARGB then ARGBToY and ARGBToUV444
+LIBYUV_API
+int RAWToI444(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height) {
+  int y;
+  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+      RAWToARGBRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*ARGBToUV444Row)(const uint8_t* src_raw, uint8_t* dst_u, uint8_t* dst_v,
+                         int width) = ARGBToUV444Row_C;
+  if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+  // TODO: add row coalesce when main loop handles large width in blocks
+  // TODO: implement UV444 or trim the ifdef below
+#if defined(HAS_ARGBTOUV444ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToUV444Row = ARGBToUV444Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToUV444Row = ARGBToUV444Row_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV444Row = ARGBToUV444Row_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV444Row = ARGBToUV444Row_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUV444Row = ARGBToUV444Row_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToYRow = ARGBToYRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToYRow = ARGBToYRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBToYRow = ARGBToYRow_RVV;
+  }
+#endif
+
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToARGBRow = RAWToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToARGBRow = RAWToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToARGBRow = RAWToARGBRow_SVE2;
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToARGBRow = RAWToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    RAWToARGBRow = RAWToARGBRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    RAWToARGBRow = RAWToARGBRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      RAWToARGBRow = RAWToARGBRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    RAWToARGBRow = RAWToARGBRow_RVV;
+  }
+#endif
+
+  {
+    // Allocate a row of ARGB.
+    const int row_size = width * 4;
+    align_buffer_64(row, row_size);
+    if (!row)
+      return 1;
+
+    for (y = 0; y < height; ++y) {
+      RAWToARGBRow(src_raw, row, width);
+      ARGBToUV444Row(row, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      src_raw += src_stride_raw;
+      dst_y += dst_stride_y;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    free_aligned_buffer_64(row);
+  }
+  return 0;
+}
+
 // RAW big endian (rgb in memory) to J444
 // 2 step conversion of RAWToARGB then ARGBToYJ and ARGBToUVJ444
 LIBYUV_API
@@ -3714,7 +3926,7 @@
 
   {
     // Allocate a row of ARGB.
-    const int row_size = (width * 4 + 31) & ~31;
+    const int row_size = width * 4;
     align_buffer_64(row, row_size);
     if (!row)
       return 1;
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 498f66f..be36343 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -678,6 +678,7 @@
 TESTATOPLANAR(I400, 1, 1, I420, 2, 2)
 TESTATOPLANAR(J400, 1, 1, J420, 2, 2)
 TESTATOPLANAR(RAW, 3, 1, I420, 2, 2)
+TESTATOPLANAR(RAW, 3, 1, I444, 1, 1)
 TESTATOPLANAR(RAW, 3, 1, J420, 2, 2)
 TESTATOPLANAR(RAW, 3, 1, J444, 1, 1)
 TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2)