HalfMergeUVPlane function and optimized I444ToNV12 and I444ToNV21
Bug: libyuv:858
Change-Id: Ie1f03a9acaff02ee8059cf1e5c2c2e5afcde8592
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2154608
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 30a6d3b..9caef1b 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -105,6 +105,19 @@
int width,
int height);
+// Scale U and V to half width and height and merge into interleaved UV plane.
+// width and height are source size, allowing odd sizes.
+// Use for converting I444 or I422 to NV12.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
// Swap U and V channels in interleaved UV plane.
LIBYUV_API
void SwapUVPlane(const uint8_t* src_uv,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 45db1b9..02abff6 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -273,6 +273,7 @@
#define HAS_ARGBTOAR30ROW_SSSE3
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
+#define HAS_HALFMERGEUVROW_SSSE3
// I210 is for H010. 2 = 422. I for 601 vs H for 709.
#define HAS_I210TOAR30ROW_SSSE3
#define HAS_I210TOARGBROW_SSSE3
@@ -343,7 +344,6 @@
#define HAS_ARGBTOUVJROW_NEON
#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
-#define HAS_RGBATOYJROW_NEON
#define HAS_ARGBTOYROW_NEON
#define HAS_AYUVTOUVROW_NEON
#define HAS_AYUVTOVUROW_NEON
@@ -353,6 +353,7 @@
#define HAS_BYTETOFLOATROW_NEON
#define HAS_COPYROW_NEON
#define HAS_HALFFLOATROW_NEON
+#define HAS_HALFMERGEUVROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_I422ALPHATOARGBROW_NEON
#define HAS_I422TOARGB1555ROW_NEON
@@ -375,19 +376,20 @@
#define HAS_NV21TORGB24ROW_NEON
#define HAS_NV21TOYUV24ROW_NEON
#define HAS_RAWTOARGBROW_NEON
-#define HAS_RAWTORGBAROW_NEON
#define HAS_RAWTORGB24ROW_NEON
+#define HAS_RAWTORGBAROW_NEON
#define HAS_RAWTOUVROW_NEON
-#define HAS_RAWTOYROW_NEON
#define HAS_RAWTOYJROW_NEON
+#define HAS_RAWTOYROW_NEON
#define HAS_RGB24TOARGBROW_NEON
#define HAS_RGB24TOUVROW_NEON
-#define HAS_RGB24TOYROW_NEON
#define HAS_RGB24TOYJROW_NEON
+#define HAS_RGB24TOYROW_NEON
#define HAS_RGB565TOARGBROW_NEON
#define HAS_RGB565TOUVROW_NEON
#define HAS_RGB565TOYROW_NEON
#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYJROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
#define HAS_SPLITRGBROW_NEON
@@ -1712,6 +1714,27 @@
uint8_t* dst_ptr,
int width);
+void HalfMergeUVRow_C(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
void SplitRGBRow_C(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
diff --git a/source/convert.cc b/source/convert.cc
index 0645deb..ce1152d 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -426,7 +426,41 @@
dst_v, dst_stride_v, width, height, width, height);
}
-// TODO(fbarchard): Implement row conversion.
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+ dst_stride_uv, width, height);
+ return 0;
+}
+
LIBYUV_API
int I444ToNV21(const uint8_t* src_y,
int src_stride_y,
@@ -440,30 +474,9 @@
int dst_stride_vu,
int width,
int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- // Allocate u and v buffers
- align_buffer_64(plane_u, halfwidth * halfheight * 2);
- uint8_t* plane_v = plane_u + halfwidth * halfheight;
-
- I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
- dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
- height);
- MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
- halfwidth, halfheight);
- free_aligned_buffer_64(plane_u);
- return 0;
+ return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
+ width, height);
}
// I400 is greyscale typically used in MJPG
@@ -498,46 +511,6 @@
return 0;
}
-// TODO(fbarchard): Implement row conversion.
-LIBYUV_API
-int I444ToNV12(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_uv,
- int dst_stride_uv,
- int width,
- int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- // Allocate u and v buffers
- align_buffer_64(plane_u, halfwidth * halfheight * 2);
- uint8_t* plane_v = plane_u + halfwidth * halfheight;
-
- I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
- dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
- height);
- MergeUVPlane(plane_u, halfwidth, plane_v, halfwidth, dst_uv, dst_stride_uv,
- halfwidth, halfheight);
- free_aligned_buffer_64(plane_u);
- return 0;
-}
-
// I400 is greyscale typically used in MJPG
LIBYUV_API
int I400ToNV21(const uint8_t* src_y,
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 0c95f1f..25404af 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -488,7 +488,6 @@
return 0;
}
-// TODO(fbarchard): test negative height for invert.
LIBYUV_API
int I420ToNV12(const uint8_t* src_y,
int src_stride_y,
@@ -502,12 +501,23 @@
int dst_stride_uv,
int width,
int height) {
+ int halfwidth = (width + 1) / 2;
+ int halfheight = (height + 1) / 2;
if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
height == 0) {
return -1;
}
- int halfwidth = (width + 1) / 2;
- int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
if (dst_y) {
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index eea4fdc..20347cf 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -4103,6 +4103,52 @@
return 0;
}
+// width and height are src size allowing odd size handling.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u,
+ const uint8_t* src_v, int src_stride_v,
+ uint8_t* dst_uv, int width) = HalfMergeUVRow_C;
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+#if defined(HAS_HALFMERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ HalfMergeUVRow = HalfMergeUVRow_NEON;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+ HalfMergeUVRow = HalfMergeUVRow_SSSE3;
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ // Merge a row of U and V into a row of UV.
+ HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
+ src_u += src_stride_u * 2;
+ src_v += src_stride_v * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width);
+ }
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/source/row_common.cc b/source/row_common.cc
index 026845b..67b0c72 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -3563,6 +3563,30 @@
}
}
+void HalfMergeUVRow_C(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
+ src_u[src_stride_u + 1] + 2) >>
+ 2;
+ dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
+ src_v[src_stride_v + 1] + 2) >>
+ 2;
+ src_u += 2;
+ src_v += 2;
+ dst_uv += 2;
+ }
+ if (width & 1) {
+ dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1;
+ dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1;
+ }
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 9e7b9e5..e208856 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1078,6 +1078,8 @@
}
#endif
+// clang-format off
+
// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
// round parameter is register containing value to add before shift.
#define RGBTOY(round) \
@@ -1102,10 +1104,8 @@
"phaddw %%xmm0,%%xmm6 \n" \
"phaddw %%xmm2,%%xmm1 \n" \
"prefetcht0 1280(%0) \n" \
- "paddw %%" #round \
- ",%%xmm6 \n" \
- "paddw %%" #round \
- ",%%xmm1 \n" \
+ "paddw %%" #round ",%%xmm6 \n" \
+ "paddw %%" #round ",%%xmm1 \n" \
"psrlw $0x8,%%xmm6 \n" \
"psrlw $0x8,%%xmm1 \n" \
"packuswb %%xmm1,%%xmm6 \n" \
@@ -1132,10 +1132,8 @@
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
"prefetcht0 1280(%0) \n" \
- "vpaddw %%" #round \
- ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
- "vpaddw %%" #round \
- ",%%ymm2,%%ymm2 \n" \
+ "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
+ "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
"vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
"vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
@@ -1146,6 +1144,8 @@
"jg 1b \n" \
"vzeroupper \n"
+// clang-format on
+
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@@ -7005,6 +7005,53 @@
}
#endif // HAS_SWAPUVROW_AVX2
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "1: \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // load 16 U values
+ "movdqu (%1),%%xmm1 \n" // load 16 V values
+ "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row
+ "movdqu 0(%1,%5,1),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // half size
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n" // store 8 UV pixels
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n" // 16 src pixels per loop
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
diff --git a/source/row_neon.cc b/source/row_neon.cc
index f358bd3..aecdf32 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2984,6 +2984,39 @@
: "cc", "memory", "q0", "q1", "q2");
}
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_u_1 = src_u + src_stride_u;
+ const uint8_t* src_v_1 = src_v + src_stride_v;
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 U values
+ "vld1.8 {q1}, [%2]! \n" // load 16 V values
+ "vld1.8 {q2}, [%1]! \n"
+ "vld1.8 {q3}, [%3]! \n"
+ "vpaddl.u8 q0, q0 \n" // half size
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q1, q3 \n"
+ "vqrshrn.u16 d0, q0, #2 \n"
+ "vqrshrn.u16 d1, q1, #2 \n"
+ "subs %5, %5, #16 \n" // 16 src pixels per loop
+ "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
#ifdef __cplusplus
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index b57147b..f9e0fd3 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -3188,11 +3188,12 @@
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
"uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
"uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
"uqrshrn v2.8b, v1.8h, #2 \n"
+ "prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
@@ -3210,18 +3211,18 @@
asm volatile(
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- // pixels.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
"uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
"uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
"uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
"uqrshrn v1.8b, v1.8h, #2 \n"
+ "prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
@@ -3265,6 +3266,41 @@
: "cc", "memory", "v0", "v1", "v2");
}
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_u_1 = src_u + src_stride_u;
+ const uint8_t* src_v_1 = src_v + src_stride_v;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values
+ "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values
+ "ld1 {v2.16b}, [%1], #16 \n"
+ "ld1 {v3.16b}, [%3], #16 \n"
+ "uaddlp v0.8h, v0.16b \n" // half size
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.8h, v2.16b \n"
+ "uadalp v1.8h, v3.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uqrshrn v0.8b, v0.8h, #2 \n"
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w5, %w5, #16 \n" // 16 src pixels per loop
+ "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index f97ad9a..31a4253 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -21,6 +21,7 @@
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
+#include "libyuv/scale.h"
#ifdef ENABLE_ROW_TESTS
// row.h defines SIMD_ALIGNED, overriding unit_test.h
@@ -3479,4 +3480,50 @@
free_aligned_buffer_page_end(orig_pixels);
}
+TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
+ // Round count up to multiple of 16
+ int dst_width = (benchmark_width_ + 1) / 2;
+ int dst_height = (benchmark_height_ + 1) / 2;
+ align_buffer_page_end(src_pixels_u, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_pixels_v, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(tmp_pixels_u, dst_width * dst_height);
+ align_buffer_page_end(tmp_pixels_v, dst_width * dst_height);
+ align_buffer_page_end(dst_pixels_uv_opt, dst_width * 2 * dst_height);
+ align_buffer_page_end(dst_pixels_uv_c, dst_width * 2 * dst_height);
+
+ MemRandomize(src_pixels_u, benchmark_width_ * benchmark_height_);
+ MemRandomize(src_pixels_v, benchmark_width_ * benchmark_height_);
+ MemRandomize(tmp_pixels_u, dst_width * dst_height);
+ MemRandomize(tmp_pixels_v, dst_width * dst_height);
+ MemRandomize(dst_pixels_uv_opt, dst_width * 2 * dst_height);
+ MemRandomize(dst_pixels_uv_c, dst_width * 2 * dst_height);
+
+ ScalePlane(src_pixels_u, benchmark_width_, benchmark_width_,
+ benchmark_height_,
+
+ tmp_pixels_u, dst_width, dst_width, dst_height, kFilterBilinear);
+ ScalePlane(src_pixels_v, benchmark_width_, benchmark_width_,
+ benchmark_height_, tmp_pixels_v, dst_width, dst_width, dst_height,
+ kFilterBilinear);
+ MergeUVPlane(tmp_pixels_u, dst_width, tmp_pixels_v, dst_width,
+ dst_pixels_uv_c, dst_width * 2, dst_width, dst_height);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
+ benchmark_width_, dst_pixels_uv_opt, dst_width * 2,
+ benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < dst_width * 2 * dst_height; ++i) {
+ EXPECT_EQ(dst_pixels_uv_c[i], dst_pixels_uv_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_u);
+ free_aligned_buffer_page_end(src_pixels_v);
+ free_aligned_buffer_page_end(tmp_pixels_u);
+ free_aligned_buffer_page_end(tmp_pixels_v);
+ free_aligned_buffer_page_end(dst_pixels_uv_opt);
+ free_aligned_buffer_page_end(dst_pixels_uv_c);
+}
+
} // namespace libyuv