Add HalfMergeUVPlane function and optimize I444ToNV12 and I444ToNV21
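
HalfMergeUVPlane 2x2 box-filters separate full-size U and V planes
directly into a half-size interleaved UV plane in a single pass.
I444ToNV12 now copies the Y plane and calls HalfMergeUVPlane,
replacing the previous I444ToI420 + MergeUVPlane path and its
temporary buffers. I444ToNV21 forwards to I444ToNV12 with the U and
V arguments swapped.

A minimal usage sketch (buffer and stride variables are illustrative,
not part of the patch):

  // Convert I444 chroma to NV12 chroma in one pass.
  // width and height are the source size; odd sizes are handled.
  HalfMergeUVPlane(src_u, src_stride_u,    // full-size U plane
                   src_v, src_stride_v,    // full-size V plane
                   dst_uv, dst_stride_uv,  // half-size interleaved UV
                   width, height);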

Bug: libyuv:858
Change-Id: Ie1f03a9acaff02ee8059cf1e5c2c2e5afcde8592
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2154608
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 30a6d3b..9caef1b 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -105,6 +105,19 @@
                   int width,
                   int height);
 
+// Scale U and V to half width and height and merge into interleaved UV plane.
+// width and height are source size, allowing odd sizes.
+// Use for converting I444 or I422 to NV12.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int dst_stride_uv,
+                      int width,
+                      int height);
+
 // Swap U and V channels in interleaved UV plane.
 LIBYUV_API
 void SwapUVPlane(const uint8_t* src_uv,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 45db1b9..02abff6 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -273,6 +273,7 @@
 #define HAS_ARGBTOAR30ROW_SSSE3
 #define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_CONVERT8TO16ROW_SSE2
+#define HAS_HALFMERGEUVROW_SSSE3
 // I210 is for H010.  2 = 422.  I for 601 vs H for 709.
 #define HAS_I210TOAR30ROW_SSSE3
 #define HAS_I210TOARGBROW_SSSE3
@@ -343,7 +344,6 @@
 #define HAS_ARGBTOUVJROW_NEON
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
-#define HAS_RGBATOYJROW_NEON
 #define HAS_ARGBTOYROW_NEON
 #define HAS_AYUVTOUVROW_NEON
 #define HAS_AYUVTOVUROW_NEON
@@ -353,6 +353,7 @@
 #define HAS_BYTETOFLOATROW_NEON
 #define HAS_COPYROW_NEON
 #define HAS_HALFFLOATROW_NEON
+#define HAS_HALFMERGEUVROW_NEON
 #define HAS_I400TOARGBROW_NEON
 #define HAS_I422ALPHATOARGBROW_NEON
 #define HAS_I422TOARGB1555ROW_NEON
@@ -375,19 +376,20 @@
 #define HAS_NV21TORGB24ROW_NEON
 #define HAS_NV21TOYUV24ROW_NEON
 #define HAS_RAWTOARGBROW_NEON
-#define HAS_RAWTORGBAROW_NEON
 #define HAS_RAWTORGB24ROW_NEON
+#define HAS_RAWTORGBAROW_NEON
 #define HAS_RAWTOUVROW_NEON
-#define HAS_RAWTOYROW_NEON
 #define HAS_RAWTOYJROW_NEON
+#define HAS_RAWTOYROW_NEON
 #define HAS_RGB24TOARGBROW_NEON
 #define HAS_RGB24TOUVROW_NEON
-#define HAS_RGB24TOYROW_NEON
 #define HAS_RGB24TOYJROW_NEON
+#define HAS_RGB24TOYROW_NEON
 #define HAS_RGB565TOARGBROW_NEON
 #define HAS_RGB565TOUVROW_NEON
 #define HAS_RGB565TOYROW_NEON
 #define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYJROW_NEON
 #define HAS_RGBATOYROW_NEON
 #define HAS_SETROW_NEON
 #define HAS_SPLITRGBROW_NEON
@@ -1712,6 +1714,27 @@
                         uint8_t* dst_ptr,
                         int width);
 
+void HalfMergeUVRow_C(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int width);
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width);
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          uint8_t* dst_uv,
+                          int width);
+
 void SplitRGBRow_C(const uint8_t* src_rgb,
                    uint8_t* dst_r,
                    uint8_t* dst_g,
diff --git a/source/convert.cc b/source/convert.cc
index 0645deb..ce1152d 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -426,7 +426,41 @@
                     dst_v, dst_stride_v, width, height, width, height);
 }
 
-// TODO(fbarchard): Implement row conversion.
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+                   dst_stride_uv, width, height);
+  return 0;
+}
+
 LIBYUV_API
 int I444ToNV21(const uint8_t* src_y,
                int src_stride_y,
@@ -440,30 +474,9 @@
                int dst_stride_vu,
                int width,
                int height) {
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (height - 1) * src_stride_u;
-    src_v = src_v + (height - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  // Allocate u and v buffers
-  align_buffer_64(plane_u, halfwidth * halfheight * 2);
-  uint8_t* plane_v = plane_u + halfwidth * halfheight;
-
-  I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
-             dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
-             height);
-  MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
-               halfwidth, halfheight);
-  free_aligned_buffer_64(plane_u);
-  return 0;
+  return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                    src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
+                    width, height);
 }
 
 // I400 is greyscale typically used in MJPG
@@ -498,46 +511,6 @@
   return 0;
 }
 
-// TODO(fbarchard): Implement row conversion.
-LIBYUV_API
-int I444ToNV12(const uint8_t* src_y,
-               int src_stride_y,
-               const uint8_t* src_u,
-               int src_stride_u,
-               const uint8_t* src_v,
-               int src_stride_v,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_uv,
-               int dst_stride_uv,
-               int width,
-               int height) {
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (height - 1) * src_stride_u;
-    src_v = src_v + (height - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  // Allocate u and v buffers
-  align_buffer_64(plane_u, halfwidth * halfheight * 2);
-  uint8_t* plane_v = plane_u + halfwidth * halfheight;
-
-  I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
-             dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
-             height);
-  MergeUVPlane(plane_u, halfwidth, plane_v, halfwidth, dst_uv, dst_stride_uv,
-               halfwidth, halfheight);
-  free_aligned_buffer_64(plane_u);
-  return 0;
-}
-
 // I400 is greyscale typically used in MJPG
 LIBYUV_API
 int I400ToNV21(const uint8_t* src_y,
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 0c95f1f..25404af 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -488,7 +488,6 @@
   return 0;
 }
 
-// TODO(fbarchard): test negative height for invert.
 LIBYUV_API
 int I420ToNV12(const uint8_t* src_y,
                int src_stride_y,
@@ -502,12 +501,23 @@
                int dst_stride_uv,
                int width,
                int height) {
+  int halfwidth = (width + 1) / 2;
+  int halfheight = (height + 1) / 2;
   if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
       height == 0) {
     return -1;
   }
-  int halfwidth = (width + 1) / 2;
-  int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
   if (dst_y) {
     CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   }
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index eea4fdc..20347cf 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -4103,6 +4103,52 @@
   return 0;
 }
 
+// width and height are the source size, allowing odd sizes.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int dst_stride_uv,
+                      int width,
+                      int height) {
+  int y;
+  void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u,
+                         const uint8_t* src_v, int src_stride_v,
+                         uint8_t* dst_uv, int width) = HalfMergeUVRow_C;
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
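+  // SIMD row functions require width to be a multiple of 16; otherwise
+  // the C row function handles the whole plane.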
+#if defined(HAS_HALFMERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+    HalfMergeUVRow = HalfMergeUVRow_NEON;
+  }
+#endif
+#if defined(HAS_HALFMERGEUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+    HalfMergeUVRow = HalfMergeUVRow_SSSE3;
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    // Merge two source rows of U and V into one half-width row of UV.
+    HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
+    src_u += src_stride_u * 2;
+    src_v += src_stride_v * 2;
+    dst_uv += dst_stride_uv;
+  }
+  if (height & 1) {
+    // Odd height: pass stride 0 so the last row is averaged with itself.
+    HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width);
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/row_common.cc b/source/row_common.cc
index 026845b..67b0c72 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -3563,6 +3563,30 @@
   }
 }
 
+void HalfMergeUVRow_C(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
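+    // Average each 2x2 block of U and of V; the +2 rounds the >> 2.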
+    dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
+                 src_u[src_stride_u + 1] + 2) >>
+                2;
+    dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
+                 src_v[src_stride_v + 1] + 2) >>
+                2;
+    src_u += 2;
+    src_v += 2;
+    dst_uv += 2;
+  }
+  if (width & 1) {
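+    // Odd width: average the last column vertically only.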
+    dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1;
+    dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 9e7b9e5..e208856 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1078,6 +1078,8 @@
 }
 #endif
 
+// clang-format off
+
 // TODO(mraptis): Consider passing R, G, B multipliers as parameter.
 // round parameter is register containing value to add before shift.
 #define RGBTOY(round)                            \
@@ -1102,10 +1104,8 @@
   "phaddw    %%xmm0,%%xmm6                   \n" \
   "phaddw    %%xmm2,%%xmm1                   \n" \
   "prefetcht0 1280(%0)                       \n" \
-  "paddw     %%" #round                          \
-  ",%%xmm6             \n"                       \
-  "paddw     %%" #round                          \
-  ",%%xmm1             \n"                       \
+  "paddw     %%" #round ",%%xmm6             \n" \
+  "paddw     %%" #round ",%%xmm1             \n" \
   "psrlw     $0x8,%%xmm6                     \n" \
   "psrlw     $0x8,%%xmm1                     \n" \
   "packuswb  %%xmm1,%%xmm6                   \n" \
@@ -1132,10 +1132,8 @@
   "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n" /* mutates. */  \
   "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"                 \
   "prefetcht0 1280(%0)                       \n"                 \
-  "vpaddw     %%" #round                                         \
-  ",%%ymm0,%%ymm0     \n" /* Add .5 for rounding. */             \
-  "vpaddw     %%" #round                                         \
-  ",%%ymm2,%%ymm2     \n"                                        \
+  "vpaddw     %%" #round ",%%ymm0,%%ymm0     \n" /* Add .5 for rounding. */             \
+  "vpaddw     %%" #round ",%%ymm2,%%ymm2     \n" \
   "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"                 \
   "vpsrlw     $0x8,%%ymm2,%%ymm2             \n"                 \
   "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n" /* mutates. */  \
@@ -1146,6 +1144,8 @@
   "jg        1b                              \n"                 \
   "vzeroupper                                \n"
 
+// clang-format on
+
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
 void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@@ -7005,6 +7005,53 @@
 }
 #endif  // HAS_SWAPUVROW_AVX2
 
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          uint8_t* dst_uv,
+                          int width) {
+  asm volatile(
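+      // xmm4 = 16 bytes of 0x01: pmaddubsw with it sums adjacent byte pairs.
+      // xmm5 = 0 is used with pavgw to round the final shift.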
+      "pcmpeqb    %%xmm4,%%xmm4                  \n"
+      "psrlw      $0xf,%%xmm4                    \n"
+      "packuswb   %%xmm4,%%xmm4                  \n"
+      "pxor       %%xmm5,%%xmm5                  \n"
+      "1:                                        \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"  // load 16 U values
+      "movdqu    (%1),%%xmm1                     \n"  // load 16 V values
+      "movdqu    0(%0,%4,1),%%xmm2               \n"  // 16 from next row
+      "movdqu    0(%1,%5,1),%%xmm3               \n"
+      "lea       0x10(%0),%0                     \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"  // half size
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm4,%%xmm3                   \n"
+      "lea       0x10(%1),%1                     \n"
+      "paddw     %%xmm2,%%xmm0                   \n"
+      "paddw     %%xmm3,%%xmm1                   \n"
+      "psrlw     $0x1,%%xmm0                     \n"
+      "psrlw     $0x1,%%xmm1                     \n"
+      "pavgw     %%xmm5,%%xmm0                   \n"
+      "pavgw     %%xmm5,%%xmm1                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "punpcklbw %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"  // store 8 UV pixels
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x10,%3                        \n"  // 16 src pixels per loop
+      "jg        1b                              \n"
+      : "+r"(src_u),                    // %0
+        "+r"(src_v),                    // %1
+        "+r"(dst_uv),                   // %2
+        "+r"(width)                     // %3
+      : "r"((intptr_t)(src_stride_u)),  // %4
+        "r"((intptr_t)(src_stride_v))   // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
diff --git a/source/row_neon.cc b/source/row_neon.cc
index f358bd3..aecdf32 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2984,6 +2984,39 @@
       : "cc", "memory", "q0", "q1", "q2");
 }
 
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width) {
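+  // Second-row pointers for the vertical half of the 2x2 average.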
+  const uint8_t* src_u_1 = src_u + src_stride_u;
+  const uint8_t* src_v_1 = src_v + src_stride_v;
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 16 U values
+      "vld1.8     {q1}, [%2]!                    \n"  // load 16 V values
+      "vld1.8     {q2}, [%1]!                    \n"
+      "vld1.8     {q3}, [%3]!                    \n"
+      "vpaddl.u8  q0, q0                         \n"  // half size
+      "vpaddl.u8  q1, q1                         \n"
+      "vpadal.u8  q0, q2                         \n"
+      "vpadal.u8  q1, q3                         \n"
+      "vqrshrn.u16 d0, q0, #2                    \n"
+      "vqrshrn.u16 d1, q1, #2                    \n"
+      "subs       %5, %5, #16                    \n"  // 16 src pixels per loop
+      "vst2.8     {d0, d1}, [%4]!                \n"  // store 8 UV pixels
+      "bgt        1b                             \n"
+      : "+r"(src_u),    // %0
+        "+r"(src_u_1),  // %1
+        "+r"(src_v),    // %2
+        "+r"(src_v_1),  // %3
+        "+r"(dst_uv),   // %4
+        "+r"(width)     // %5
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
 
 #ifdef __cplusplus
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index b57147b..f9e0fd3 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -3188,11 +3188,12 @@
       "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
       "uadalp     v0.8h, v4.16b                  \n"  // V 16 bytes -> 8 shorts.
       "uadalp     v1.8h, v5.16b                  \n"  // U 16 bytes -> 8 shorts.
+      "prfm       pldl1keep, [%0, 448]           \n"
       "uqrshrn    v3.8b, v0.8h, #2               \n"  // 2x2 average
       "uqrshrn    v2.8b, v1.8h, #2               \n"
+      "prfm       pldl1keep, [%1, 448]           \n"
       "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
       "st2        {v2.8b,v3.8b}, [%2], #16       \n"  // store 8 pixels UV.
-      "prfm       pldl1keep, [%0, 448]           \n"  // prefetch 7 lines ahead
       "b.gt       1b                             \n"
       : "+r"(src_ayuv),    // %0
         "+r"(src_ayuv_1),  // %1
@@ -3210,18 +3211,18 @@
   asm volatile(
 
       "1:                                        \n"
-      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
-                                                                // pixels.
+      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ayuv
       "uaddlp     v0.8h, v0.16b                  \n"  // V 16 bytes -> 8 shorts.
       "uaddlp     v1.8h, v1.16b                  \n"  // U 16 bytes -> 8 shorts.
       "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
       "uadalp     v0.8h, v4.16b                  \n"  // V 16 bytes -> 8 shorts.
       "uadalp     v1.8h, v5.16b                  \n"  // U 16 bytes -> 8 shorts.
+      "prfm       pldl1keep, [%0, 448]           \n"
       "uqrshrn    v0.8b, v0.8h, #2               \n"  // 2x2 average
       "uqrshrn    v1.8b, v1.8h, #2               \n"
+      "prfm       pldl1keep, [%1, 448]           \n"
       "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
       "st2        {v0.8b,v1.8b}, [%2], #16       \n"  // store 8 pixels VU.
-      "prfm       pldl1keep, [%0, 448]           \n"  // prefetch 7 lines ahead
       "b.gt       1b                             \n"
       : "+r"(src_ayuv),    // %0
         "+r"(src_ayuv_1),  // %1
@@ -3265,6 +3266,41 @@
       : "cc", "memory", "v0", "v1", "v2");
 }
 
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  const uint8_t* src_u_1 = src_u + src_stride_u;
+  const uint8_t* src_v_1 = src_v + src_stride_v;
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 16 U values
+      "ld1        {v1.16b}, [%2], #16            \n"  // load 16 V values
+      "ld1        {v2.16b}, [%1], #16            \n"
+      "ld1        {v3.16b}, [%3], #16            \n"
+      "uaddlp     v0.8h, v0.16b                  \n"  // half size
+      "uaddlp     v1.8h, v1.16b                  \n"
+      "prfm       pldl1keep, [%0, 448]           \n"  // prefetch 7 lines ahead
+      "uadalp     v0.8h, v2.16b                  \n"
+      "uadalp     v1.8h, v3.16b                  \n"
+      "prfm       pldl1keep, [%2, 448]           \n"
+      "uqrshrn    v0.8b, v0.8h, #2               \n"
+      "uqrshrn    v1.8b, v1.8h, #2               \n"
+      "subs       %w5, %w5, #16                  \n"  // 16 src pixels per loop
+      "st2        {v0.8b, v1.8b}, [%4], #16      \n"  // store 8 UV pixels
+      "b.gt       1b                             \n"
+      : "+r"(src_u),    // %0
+        "+r"(src_u_1),  // %1
+        "+r"(src_v),    // %2
+        "+r"(src_v_1),  // %3
+        "+r"(dst_uv),   // %4
+        "+r"(width)     // %5
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index f97ad9a..31a4253 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -21,6 +21,7 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
+#include "libyuv/scale.h"
 
 #ifdef ENABLE_ROW_TESTS
 // row.h defines SIMD_ALIGNED, overriding unit_test.h
@@ -3479,4 +3480,50 @@
   free_aligned_buffer_page_end(orig_pixels);
 }
 
+TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
+  // Destination is half size, rounded up to handle odd source sizes.
+  int dst_width = (benchmark_width_ + 1) / 2;
+  int dst_height = (benchmark_height_ + 1) / 2;
+  align_buffer_page_end(src_pixels_u, benchmark_width_ * benchmark_height_);
+  align_buffer_page_end(src_pixels_v, benchmark_width_ * benchmark_height_);
+  align_buffer_page_end(tmp_pixels_u, dst_width * dst_height);
+  align_buffer_page_end(tmp_pixels_v, dst_width * dst_height);
+  align_buffer_page_end(dst_pixels_uv_opt, dst_width * 2 * dst_height);
+  align_buffer_page_end(dst_pixels_uv_c, dst_width * 2 * dst_height);
+
+  MemRandomize(src_pixels_u, benchmark_width_ * benchmark_height_);
+  MemRandomize(src_pixels_v, benchmark_width_ * benchmark_height_);
+  MemRandomize(tmp_pixels_u, dst_width * dst_height);
+  MemRandomize(tmp_pixels_v, dst_width * dst_height);
+  MemRandomize(dst_pixels_uv_opt, dst_width * 2 * dst_height);
+  MemRandomize(dst_pixels_uv_c, dst_width * 2 * dst_height);
+
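+  // Reference result: bilinear half-scale each plane, then interleave.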
+  ScalePlane(src_pixels_u, benchmark_width_, benchmark_width_,
+             benchmark_height_, tmp_pixels_u, dst_width, dst_width, dst_height,
+             kFilterBilinear);
+  ScalePlane(src_pixels_v, benchmark_width_, benchmark_width_,
+             benchmark_height_, tmp_pixels_v, dst_width, dst_width, dst_height,
+             kFilterBilinear);
+  MergeUVPlane(tmp_pixels_u, dst_width, tmp_pixels_v, dst_width,
+               dst_pixels_uv_c, dst_width * 2, dst_width, dst_height);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
+                     benchmark_width_, dst_pixels_uv_opt, dst_width * 2,
+                     benchmark_width_, benchmark_height_);
+  }
+
+  for (int i = 0; i < dst_width * 2 * dst_height; ++i) {
+    EXPECT_EQ(dst_pixels_uv_c[i], dst_pixels_uv_opt[i]);
+  }
+
+  free_aligned_buffer_page_end(src_pixels_u);
+  free_aligned_buffer_page_end(src_pixels_v);
+  free_aligned_buffer_page_end(tmp_pixels_u);
+  free_aligned_buffer_page_end(tmp_pixels_v);
+  free_aligned_buffer_page_end(dst_pixels_uv_opt);
+  free_aligned_buffer_page_end(dst_pixels_uv_c);
+}
+
 }  // namespace libyuv