Add NV12ToNV24 and NV16ToNV24

These are bi-planar conversion functions that scale the UV plane up to the Y plane's size using a (bi)linear filter. A usage sketch is included below.

libyuv_unittest --gtest_filter=*ToNV24*
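
A minimal usage sketch (hypothetical frame size, buffers and helper name, for
illustration only; not part of this CL):

  #include "libyuv/convert.h"

  // Hypothetical 64x32 NV12 frame upscaled to NV24 (sketch only).
  // NV12: half-width, half-height interleaved UV plane.
  // NV24: full-resolution interleaved UV plane (2 bytes per pixel).
  void UpsampleExample() {
    enum { kW = 64, kH = 32 };
    static uint8_t src_y[kW * kH];
    static uint8_t src_uv[kW * (kH / 2)];  // NV12 UV: stride kW, kH/2 rows
    static uint8_t dst_y[kW * kH];
    static uint8_t dst_uv[2 * kW * kH];    // NV24 UV: stride 2*kW, kH rows
    libyuv::NV12ToNV24(src_y, kW, src_uv, kW, dst_y, kW, dst_uv, 2 * kW,
                       kW, kH);
  }

NV16ToNV24 takes the same argument list; its source UV plane is half width but
full height, so src_stride_uv stays kW while the UV plane has kH rows. The new
ScaleUVRowUp2_Linear_C / ScaleUVRowUp2_Bilinear_C rows compute
(3*near + far + 2) >> 2 per axis and (9*near + 3*h + 3*v + far + 8) >> 4 for
the 2D case, i.e. output samples sit at 1/4 and 3/4 between source samples.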

R=fbarchard@chromium.org

Change-Id: I3d98f833feeef00af3c903ac9ad0e41bdcbcb51f
Bug: libyuv:872
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2682152
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
diff --git a/README.chromium b/README.chromium
index bdd05f1..b96e823 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1775
+Version: 1776
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h
index 137b30f..7322300 100644
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -289,6 +289,32 @@
                int width,
                int height);
 
+// Convert NV12 to NV24.
+LIBYUV_API
+int NV12ToNV24(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+
+// Convert NV16 to NV24.
+LIBYUV_API
+int NV16ToNV24(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+
 // Convert YUY2 to I420.
 LIBYUV_API
 int YUY2ToI420(const uint8_t* src_yuy2,
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index ee77d22..92759b2 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -77,12 +77,14 @@
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
 #define HAS_SCALEUVROWDOWN2BOX_SSSE3
-#define HAS_SCALECOLUP2LINEAR_SSE2
-#define HAS_SCALECOLUP2LINEAR_SSSE3
 #define HAS_SCALEROWUP2LINEAR_SSE2
 #define HAS_SCALEROWUP2LINEAR_SSSE3
-#define HAS_SCALECOLUP2LINEAR_16_SSE2
+#define HAS_SCALEROWUP2BILINEAR_SSE2
+#define HAS_SCALEROWUP2BILINEAR_SSSE3
 #define HAS_SCALEROWUP2LINEAR_16_SSE2
+#define HAS_SCALEROWUP2BILINEAR_16_SSE2
+#define HAS_SCALEUVROWUP2LINEAR_SSSE3
+#define HAS_SCALEUVROWUP2BILINEAR_SSSE3
 #endif
 
 // The following are available for gcc/clang x86 platforms, but
@@ -92,10 +94,12 @@
     (defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
     (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
 #define HAS_SCALEUVROWDOWN2BOX_AVX2
-#define HAS_SCALECOLUP2LINEAR_AVX2
 #define HAS_SCALEROWUP2LINEAR_AVX2
-#define HAS_SCALECOLUP2LINEAR_16_AVX2
+#define HAS_SCALEROWUP2BILINEAR_AVX2
 #define HAS_SCALEROWUP2LINEAR_16_AVX2
+#define HAS_SCALEROWUP2BILINEAR_16_AVX2
+#define HAS_SCALEUVROWUP2LINEAR_AVX2
+#define HAS_SCALEUVROWUP2BILINEAR_AVX2
 #endif
 
 // The following are available on all x86 platforms, but
@@ -124,10 +128,12 @@
 #define HAS_SCALEROWDOWN4_NEON
 #define HAS_SCALEUVROWDOWN2BOX_NEON
 #define HAS_SCALEUVROWDOWNEVEN_NEON
-#define HAS_SCALECOLUP2LINEAR_NEON
 #define HAS_SCALEROWUP2LINEAR_NEON
-#define HAS_SCALECOLUP2LINEAR_16_NEON
+#define HAS_SCALEROWUP2BILINEAR_NEON
 #define HAS_SCALEROWUP2LINEAR_16_NEON
+#define HAS_SCALEROWUP2BILINEAR_16_NEON
+#define HAS_SCALEUVROWUP2LINEAR_NEON
+#define HAS_SCALEUVROWUP2BILINEAR_NEON
 #endif
 
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -464,6 +470,24 @@
                              int src_stepx,
                              uint8_t* dst_uv,
                              int dst_width);
+
+void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              ptrdiff_t dst_stride,
+                              int dst_width);
+void ScaleUVRowUp2_Linear_Any_C(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  ptrdiff_t dst_stride,
+                                  int dst_width);
+
 void ScaleUVCols_C(uint8_t* dst_uv,
                    const uint8_t* src_uv,
                    int dst_width,
@@ -1163,6 +1187,55 @@
                                    uint8_t* dst_ptr,
                                    int dst_width);
 
+void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  ptrdiff_t dst_stride,
+                                  int dst_width);
+void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8_t* dst_ptr,
+                                      ptrdiff_t dst_stride,
+                                      int dst_width);
+void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 ptrdiff_t dst_stride,
+                                 int dst_width);
+void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8_t* dst_ptr,
+                                     ptrdiff_t dst_stride,
+                                     int dst_width);
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 ptrdiff_t dst_stride,
+                                 int dst_width);
+void ScaleUVRowUp2_Linear_Any_NEON(const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8_t* dst_ptr,
+                                     ptrdiff_t dst_stride,
+                                     int dst_width);
+
 // ScaleRowDown2Box also used by planar functions
 // NEON downscalers with interpolation.
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index a57dfa5..6073df8 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1775
+#define LIBYUV_VERSION 1776
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert.cc b/source/convert.cc
index 98258b9..8a4fcf0 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -16,6 +16,7 @@
 #include "libyuv/rotate.h"
 #include "libyuv/row.h"
 #include "libyuv/scale.h"  // For ScalePlane()
+#include "libyuv/scale_uv.h" // For UVScale()
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -613,6 +614,55 @@
                     width, height);
 }
 
+LIBYUV_API
+int NV12ToNV24(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  if (width == 0 || height == 0) {
+    return -1;
+  }
+
+  if (dst_y) {
+    ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+               Abs(width), Abs(height), kFilterBilinear);
+  }
+  UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1),
+          SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width),
+          Abs(height), kFilterBilinear);
+  return 0;
+}
+
+LIBYUV_API
+int NV16ToNV24(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  if (width == 0 || height == 0) {
+    return -1;
+  }
+
+  if (dst_y) {
+    ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+               Abs(width), Abs(height), kFilterBilinear);
+  }
+  UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv,
+          dst_stride_uv, Abs(width), Abs(height), kFilterBilinear);
+  return 0;
+}
+
 // Convert YUY2 to I420.
 LIBYUV_API
 int YUY2ToI420(const uint8_t* src_yuy2,
diff --git a/source/scale.cc b/source/scale.cc
index 16771cd..226024c 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1415,27 +1415,27 @@
 
   // This function can only scale up by 2 times.
   assert(src_width == ((dst_width + 1) / 2));
-  assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
+  assert(src_height == ((dst_height + 1) / 2));
 
-#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
   if (TestCpuFlag(kCpuHasSSE2)) {
     Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
   }
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
+#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
   if (TestCpuFlag(kCpuHasSSSE3)) {
     Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
   }
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+#ifdef HAS_SCALEROWUP2BILINEAR_AVX2
   if (TestCpuFlag(kCpuHasAVX2)) {
     Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
   }
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_NEON
+#ifdef HAS_SCALEROWUP2BILINEAR_NEON
   if (TestCpuFlag(kCpuHasNEON)) {
     Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
   }
@@ -1480,19 +1480,19 @@
   // This function can only scale up by 2 times horizontally.
   assert(src_width == ((dst_width + 1) / 2));
 
-#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
   if (TestCpuFlag(kCpuHasSSE2)) {
     ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
   }
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
   if (TestCpuFlag(kCpuHasAVX2)) {
     ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2;
   }
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_NEON
+#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
   if (TestCpuFlag(kCpuHasNEON)) {
     ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON;
   }
@@ -1532,21 +1532,21 @@
 
   // This function can only scale up by 2 times.
   assert(src_width == ((dst_width + 1) / 2));
-  assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
+  assert(src_height == ((dst_height + 1) / 2));
 
-#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
   if (TestCpuFlag(kCpuHasSSE2)) {
     Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2;
   }
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
   if (TestCpuFlag(kCpuHasAVX2)) {
     Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2;
   }
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_NEON
+#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON
   if (TestCpuFlag(kCpuHasNEON)) {
     Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON;
   }
diff --git a/source/scale_any.cc b/source/scale_any.cc
index 7939498..4257d17 100644
--- a/source/scale_any.cc
+++ b/source/scale_any.cc
@@ -640,7 +640,7 @@
          0,
          uint16_t)
 
-#ifdef HAS_SCALECOLUP2LINEAR_SSE2
+#ifdef HAS_SCALEROWUP2LINEAR_SSE2
 SUH2LANY(ScaleRowUp2_Linear_Any_SSE2,
          ScaleRowUp2_Linear_SSE2,
          ScaleRowUp2_Linear_C,
@@ -648,7 +648,7 @@
          uint8_t)
 #endif
 
-#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
+#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
 SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
          ScaleRowUp2_Linear_SSSE3,
          ScaleRowUp2_Linear_C,
@@ -656,7 +656,7 @@
          uint8_t)
 #endif
 
-#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
+#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
 SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
          ScaleRowUp2_Linear_16_SSE2,
          ScaleRowUp2_Linear_16_C,
@@ -664,7 +664,7 @@
          uint16_t)
 #endif
 
-#ifdef HAS_SCALECOLUP2LINEAR_AVX2
+#ifdef HAS_SCALEROWUP2LINEAR_AVX2
 SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
          ScaleRowUp2_Linear_AVX2,
          ScaleRowUp2_Linear_C,
@@ -672,7 +672,7 @@
          uint8_t)
 #endif
 
-#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
+#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
 SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
          ScaleRowUp2_Linear_16_AVX2,
          ScaleRowUp2_Linear_16_C,
@@ -680,7 +680,7 @@
          uint16_t)
 #endif
 
-#ifdef HAS_SCALECOLUP2LINEAR_NEON
+#ifdef HAS_SCALEROWUP2LINEAR_NEON
 SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
          ScaleRowUp2_Linear_NEON,
          ScaleRowUp2_Linear_C,
@@ -688,7 +688,7 @@
          uint8_t)
 #endif
 
-#ifdef HAS_SCALECOLUP2LINEAR_16_NEON
+#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
 SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
          ScaleRowUp2_Linear_16_NEON,
          ScaleRowUp2_Linear_16_C,
@@ -699,7 +699,7 @@
 #undef SUH2LANY
 
 // Scale up 2 times using bilinear filter.
-// This function produces 2 rows at a time
+// This function produces 2 rows at a time.
 #define SU2BLANY(NAME, SIMD, C, MASK, PTYPE)                            \
   void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
             ptrdiff_t dst_stride, int dst_width) {                      \
@@ -736,7 +736,7 @@
          0,
          uint16_t)
 
-#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
 SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
          ScaleRowUp2_Bilinear_SSE2,
          ScaleRowUp2_Bilinear_C,
@@ -744,7 +744,7 @@
          uint8_t)
 #endif
 
-#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
+#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
 SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
          ScaleRowUp2_Bilinear_16_SSE2,
          ScaleRowUp2_Bilinear_16_C,
@@ -752,7 +752,7 @@
          uint16_t)
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
+#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
 SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3,
          ScaleRowUp2_Bilinear_SSSE3,
          ScaleRowUp2_Bilinear_C,
@@ -760,7 +760,7 @@
          uint8_t)
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+#ifdef HAS_SCALEROWUP2BILINEAR_AVX2
 SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2,
          ScaleRowUp2_Bilinear_AVX2,
          ScaleRowUp2_Bilinear_C,
@@ -768,7 +768,7 @@
          uint8_t)
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
+#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
 SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2,
          ScaleRowUp2_Bilinear_16_AVX2,
          ScaleRowUp2_Bilinear_16_C,
@@ -776,7 +776,7 @@
          uint16_t)
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_NEON
+#ifdef HAS_SCALEROWUP2BILINEAR_NEON
 SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON,
          ScaleRowUp2_Bilinear_NEON,
          ScaleRowUp2_Bilinear_C,
@@ -784,7 +784,7 @@
          uint8_t)
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
+#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON
 SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON,
          ScaleRowUp2_Bilinear_16_NEON,
          ScaleRowUp2_Bilinear_16_C,
@@ -794,6 +794,120 @@
 
 #undef SU2BLANY
 
+// Scale bi-planar UV plane up horizontally 2 times using linear filter.
+#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE)                         \
+  void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) {    \
+    int work_width = (dst_width - 1) & ~1;                            \
+    int r = work_width & MASK;                                        \
+    int n = work_width & ~MASK;                                       \
+    dst_ptr[0] = src_ptr[0];                                          \
+    dst_ptr[1] = src_ptr[1];                                          \
+    if (work_width > 0) {                                             \
+      if (n != 0) {                                                   \
+        SIMD(src_ptr, dst_ptr + 2, n);                                \
+      }                                                               \
+      C(src_ptr + n, dst_ptr + 2 * n + 2, r);                         \
+    }                                                                 \
+    dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \
+    dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \
+  }
+
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_C,
+          ScaleUVRowUp2_Linear_C,
+          ScaleUVRowUp2_Linear_C,
+          0,
+          uint8_t)
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3,
+          ScaleUVRowUp2_Linear_SSSE3,
+          ScaleUVRowUp2_Linear_C,
+          7,
+          uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2,
+          ScaleUVRowUp2_Linear_AVX2,
+          ScaleUVRowUp2_Linear_C,
+          15,
+          uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_NEON
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
+          ScaleUVRowUp2_Linear_NEON,
+          ScaleUVRowUp2_Linear_C,
+          7,
+          uint8_t)
+#endif
+
+#undef SBUH2LANY
+
+// Scale bi-planar UV plane up 2 times using bilinear filter.
+// This function produces 2 rows at a time.
+#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE)                           \
+  void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
+            ptrdiff_t dst_stride, int dst_width) {                      \
+    int work_width = (dst_width - 1) & ~1;                              \
+    int r = work_width & MASK;                                          \
+    int n = work_width & ~MASK;                                         \
+    const PTYPE* sa = src_ptr;                                          \
+    const PTYPE* sb = src_ptr + src_stride;                             \
+    PTYPE* da = dst_ptr;                                                \
+    PTYPE* db = dst_ptr + dst_stride;                                   \
+    da[0] = (3 * sa[0] + sb[0]) >> 2;                                   \
+    db[0] = (sa[0] + 3 * sb[0]) >> 2;                                   \
+    da[1] = (3 * sa[1] + sb[1]) >> 2;                                   \
+    db[1] = (sa[1] + 3 * sb[1]) >> 2;                                   \
+    if (work_width > 0) {                                               \
+      if (n != 0) {                                                     \
+        SIMD(sa, sb - sa, da + 2, db - da, n);                          \
+      }                                                                 \
+      C(sa + n, sb - sa, da + 2 * n + 2, db - da, r);                   \
+    }                                                                   \
+    da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] +       \
+                             sb[((dst_width + 1) & ~1) - 2]) >> 2;      \
+    db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] +           \
+                             3 * sb[((dst_width + 1) & ~1) - 2]) >> 2;  \
+    da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] +       \
+                             sb[((dst_width + 1) & ~1) - 1]) >> 2;      \
+    db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] +           \
+                             3 * sb[((dst_width + 1) & ~1) - 1]) >> 2;  \
+  }
+
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
+          ScaleUVRowUp2_Bilinear_C,
+          ScaleUVRowUp2_Bilinear_C,
+          0,
+          uint8_t)
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3,
+          ScaleUVRowUp2_Bilinear_SSSE3,
+          ScaleUVRowUp2_Bilinear_C,
+          7,
+          uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2,
+          ScaleUVRowUp2_Bilinear_AVX2,
+          ScaleUVRowUp2_Bilinear_C,
+          15,
+          uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
+          ScaleUVRowUp2_Bilinear_NEON,
+          ScaleUVRowUp2_Bilinear_C,
+          7,
+          uint8_t)
+#endif
+
+#undef SBU2BLANY
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/scale_common.cc b/source/scale_common.cc
index 8d41c03..4af8432 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -1200,6 +1200,56 @@
   }
 }
 
+void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
+  int src_width = dst_width >> 1;
+  int x;
+  assert((dst_width % 2 == 0) && (dst_width >= 0));
+  for (x = 0; x < src_width; ++x) {
+    dst_ptr[4 * x + 0] =
+        (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2;
+    dst_ptr[4 * x + 1] =
+        (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2;
+    dst_ptr[4 * x + 2] =
+        (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2;
+    dst_ptr[4 * x + 3] =
+        (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2;
+  }
+}
+
+void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              ptrdiff_t dst_stride,
+                              int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  uint8_t* d = dst_ptr;
+  uint8_t* e = dst_ptr + dst_stride;
+  int src_width = dst_width >> 1;
+  int x;
+  assert((dst_width % 2 == 0) && (dst_width >= 0));
+  for (x = 0; x < src_width; ++x) {
+    d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
+                    t[2 * x + 2] * 1 + 8) >> 4;
+    d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
+                    t[2 * x + 3] * 1 + 8) >> 4;
+    d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
+                    t[2 * x + 2] * 3 + 8) >> 4;
+    d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
+                    t[2 * x + 3] * 3 + 8) >> 4;
+    e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
+                    t[2 * x + 2] * 3 + 8) >> 4;
+    e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
+                    t[2 * x + 3] * 3 + 8) >> 4;
+    e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
+                    t[2 * x + 2] * 9 + 8) >> 4;
+    e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
+                    t[2 * x + 3] * 9 + 8) >> 4;
+  }
+}
+
 // Scales a single row of pixels using point sampling.
 void ScaleUVCols_C(uint8_t* dst_uv,
                    const uint8_t* src_uv,
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
index db3c968..226e0a9 100644
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -779,7 +779,7 @@
         "xmm7");
 }
 
-#ifdef HAS_SCALECOLUP2LINEAR_SSE2
+#ifdef HAS_SCALEROWUP2LINEAR_SSE2
 void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int dst_width) {
@@ -833,7 +833,7 @@
 }
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
 void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
@@ -949,7 +949,7 @@
 }
 #endif
 
-#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
+#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
 void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 int dst_width) {
@@ -999,7 +999,7 @@
 }
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
+#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
 void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
                                   ptrdiff_t src_stride,
                                   uint16_t* dst_ptr,
@@ -1106,7 +1106,7 @@
 }
 #endif
 
-#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
+#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
 static const uvec8 kLinearMadd31_SSSE3 = {3, 1, 1, 3, 3, 1, 1, 3,
                                           3, 1, 1, 3, 3, 1, 1, 3};
 
@@ -1149,7 +1149,7 @@
 }
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
+#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
 void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_ptr,
@@ -1236,7 +1236,7 @@
 }
 #endif
 
-#ifdef HAS_SCALECOLUP2LINEAR_AVX2
+#ifdef HAS_SCALEROWUP2LINEAR_AVX2
 static const lvec8 kLinearMadd31_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1,
                                          3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1,
                                          1, 3, 3, 1, 1, 3, 3, 1, 1, 3};
@@ -1281,7 +1281,7 @@
 }
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+#ifdef HAS_SCALEROWUP2BILINEAR_AVX2
 void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
@@ -1364,7 +1364,7 @@
 }
 #endif
 
-#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
+#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
 static const lvec16 kLinearMadd31_16_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3,
                                              3, 1, 1, 3, 3, 1, 1, 3};
 
@@ -1450,7 +1450,7 @@
 }
 #endif
 
-#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
+#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
 void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
                                   ptrdiff_t src_stride,
                                   uint16_t* dst_ptr,
@@ -2261,6 +2261,257 @@
 }
 #endif  // HAS_SCALEUVROWDOWN2BOX_AVX2
 
+#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
+static const uvec8 kUVLinearMadd31_SSSE3 = {3, 1, 3, 1, 1, 3, 1, 3,
+                                            3, 1, 3, 1, 1, 3, 1, 3};
+void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "pcmpeqw     %%xmm4,%%xmm4                 \n"
+      "psrlw       $15,%%xmm4                    \n"
+      "psllw       $1,%%xmm4                     \n"  // all 2
+      "movdqu      %3,%%xmm3                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movq        (%0),%%xmm0                   \n"  // 00112233 (1u1v)
+      "movq        2(%0),%%xmm1                  \n"  // 11223344 (1u1v)
+      "punpcklbw   %%xmm1,%%xmm0                 \n"  // 0101121223233434 (2u2v)
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "punpckhdq   %%xmm0,%%xmm2                 \n"  // 2323232334343434 (2u2v)
+      "punpckldq   %%xmm0,%%xmm0                 \n"  // 0101010112121212 (2u2v)
+      "pmaddubsw   %%xmm3,%%xmm2                 \n"  // 3*near+far (1u1v16, hi)
+      "pmaddubsw   %%xmm3,%%xmm0                 \n"  // 3*near+far (1u1v16, lo)
+      "paddw       %%xmm4,%%xmm0                 \n"  // 3*near+far+2 (lo)
+      "paddw       %%xmm4,%%xmm2                 \n"  // 3*near+far+2 (hi)
+      "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
+      "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far (hi)
+      "vpackuswb   %%xmm2,%%xmm0,%%xmm0          \n"
+      "vmovdqu     %%xmm0,(%1)                   \n"
+
+      "lea         0x8(%0),%0                    \n"
+      "lea         0x10(%1),%1                   \n"  // 4 uv to 8 uv
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),              // %0
+        "+r"(dst_ptr),              // %1
+        "+r"(dst_width)             // %2
+      : "m"(kUVLinearMadd31_SSSE3)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
+void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  ptrdiff_t dst_stride,
+                                  int dst_width) {
+  asm volatile(
+      "pcmpeqw     %%xmm6,%%xmm6                 \n"
+      "psrlw       $15,%%xmm6                    \n"
+      "psllw       $3,%%xmm6                     \n"  // all 8
+      "movdqu      %5,%%xmm7                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movq        (%0),%%xmm0                   \n"  // 00112233 (1u1v)
+      "movq        2(%0),%%xmm1                  \n"  // 11223344 (1u1v)
+      "punpcklbw   %%xmm1,%%xmm0                 \n"  // 0101121223233434 (2u2v)
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "punpckhdq   %%xmm0,%%xmm2                 \n"  // 2323232334343434 (2u2v)
+      "punpckldq   %%xmm0,%%xmm0                 \n"  // 0101010112121212 (2u2v)
+      "pmaddubsw   %%xmm7,%%xmm2                 \n"  // 3*near+far (1u1v16, hi)
+      "pmaddubsw   %%xmm7,%%xmm0                 \n"  // 3*near+far (1u1v16, lo)
+
+      "movq        (%0,%3),%%xmm1                \n"
+      "movq        2(%0,%3),%%xmm4               \n"
+      "punpcklbw   %%xmm4,%%xmm1                 \n"
+      "movdqa      %%xmm1,%%xmm3                 \n"
+      "punpckhdq   %%xmm1,%%xmm3                 \n"
+      "punpckldq   %%xmm1,%%xmm1                 \n"
+      "pmaddubsw   %%xmm7,%%xmm3                 \n"  // 3*near+far (2, hi)
+      "pmaddubsw   %%xmm7,%%xmm1                 \n"  // 3*near+far (2, lo)
+
+      // xmm0 xmm2
+      // xmm1 xmm3
+
+      "movdqa      %%xmm0,%%xmm4                 \n"
+      "movdqa      %%xmm1,%%xmm5                 \n"
+      "paddw       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
+      "paddw       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
+      "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
+      "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
+      "psrlw       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
+
+      "movdqa      %%xmm1,%%xmm5                 \n"
+      "paddw       %%xmm1,%%xmm5                 \n"  // 6*near+2*far (2, lo)
+      "paddw       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
+      "paddw       %%xmm1,%%xmm5                 \n"  // 9*near+3*far (2, lo)
+      "paddw       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
+      "psrlw       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
+
+      "movdqa      %%xmm2,%%xmm0                 \n"
+      "movdqa      %%xmm3,%%xmm1                 \n"
+      "paddw       %%xmm2,%%xmm0                 \n"  // 6*near+2*far (1, hi)
+      "paddw       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (2, hi)
+      "paddw       %%xmm2,%%xmm0                 \n"  // 9*near+3*far (1, hi)
+      "paddw       %%xmm1,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
+      "psrlw       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
+
+      "movdqa      %%xmm3,%%xmm1                 \n"
+      "paddw       %%xmm3,%%xmm1                 \n"  // 6*near+2*far (2, hi)
+      "paddw       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
+      "paddw       %%xmm3,%%xmm1                 \n"  // 9*near+3*far (2, hi)
+      "paddw       %%xmm2,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, hi)
+      "psrlw       $4,%%xmm1                     \n"  // ^ div by 16 (2, hi)
+
+      "packuswb    %%xmm0,%%xmm4                 \n"
+      "movdqu      %%xmm4,(%1)                   \n"  // store above
+      "packuswb    %%xmm1,%%xmm5                 \n"
+      "movdqu      %%xmm5,(%1,%4)                \n"  // store below
+
+      "lea         0x8(%0),%0                    \n"
+      "lea         0x10(%1),%1                   \n"  // 4 uv to 8 uv
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),                // %0
+        "+r"(dst_ptr),                // %1
+        "+r"(dst_width)               // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "r"((intptr_t)(dst_stride)),  // %4
+        "m"(kUVLinearMadd31_SSSE3)      // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
+static const lvec8 kUVLinearMadd31_AVX2 = {3, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3,
+                                           1, 1, 3, 1, 3, 3, 1, 3, 1, 1, 3,
+                                           1, 3, 3, 1, 3, 1, 1, 3, 1, 3};
+
+void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
+      "vpsrlw      $15,%%ymm4,%%ymm4             \n"
+      "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
+      "vmovdqu     %3,%%ymm3                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%xmm0                   \n"
+      "vmovdqu     2(%0),%%xmm1                  \n"
+      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
+      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
+      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpunpckhdq  %%ymm0,%%ymm0,%%ymm2          \n"
+      "vpunpckldq  %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpmaddubsw  %%ymm3,%%ymm2,%%ymm1          \n"  // 3*near+far (hi)
+      "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0          \n"  // 3*near+far (lo)
+      "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"  // 3*near+far+2 (lo)
+      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // 3*near+far+2 (hi)
+      "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
+      "vpsrlw      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+
+      "lea         0x10(%0),%0                   \n"
+      "lea         0x20(%1),%1                   \n"  // 8 uv to 16 uv
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),             // %0
+        "+r"(dst_ptr),             // %1
+        "+r"(dst_width)            // %2
+      : "m"(kUVLinearMadd31_AVX2)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
+void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 ptrdiff_t dst_stride,
+                                 int dst_width) {
+  asm volatile(
+      "vpcmpeqw    %%ymm6,%%ymm6,%%ymm6          \n"
+      "vpsrlw      $15,%%ymm6,%%ymm6             \n"
+      "vpsllw      $3,%%ymm6,%%ymm6              \n"  // all 8
+      "vmovdqu     %5,%%ymm7                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%xmm0                   \n"
+      "vmovdqu     2(%0),%%xmm1                  \n"
+      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
+      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
+      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpunpckhdq  %%ymm0,%%ymm0,%%ymm2          \n"
+      "vpunpckldq  %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm1          \n"  // 3*near+far (1, hi)
+      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm0          \n"  // 3*near+far (1, lo)
+
+      "vmovdqu     (%0,%3),%%xmm2                \n"  // 0123456789ABCDEF
+      "vmovdqu     2(%0,%3),%%xmm3               \n"  // 123456789ABCDEF0
+      "vpermq      $0b11011000,%%ymm2,%%ymm2     \n"
+      "vpermq      $0b11011000,%%ymm3,%%ymm3     \n"
+      "vpunpcklbw  %%ymm3,%%ymm2,%%ymm2          \n"
+      "vpunpckhdq  %%ymm2,%%ymm2,%%ymm4          \n"
+      "vpunpckldq  %%ymm2,%%ymm2,%%ymm2          \n"
+      "vpmaddubsw  %%ymm7,%%ymm4,%%ymm3          \n"  // 3*near+far (2, hi)
+      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm2          \n"  // 3*near+far (2, lo)
+
+      // ymm0 ymm1
+      // ymm2 ymm3
+
+      "vpaddw      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
+      "vpaddw      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
+      "vpaddw      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
+      "vpaddw      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
+      "vpsrlw      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
+
+      "vpaddw      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
+      "vpaddw      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
+      "vpaddw      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
+      "vpaddw      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
+      "vpsrlw      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
+
+      "vpaddw      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
+      "vpaddw      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
+      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
+      "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
+      "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
+
+      "vpaddw      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
+      "vpaddw      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
+      "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
+      "vpaddw      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
+      "vpsrlw      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
+
+      "vpackuswb   %%ymm0,%%ymm4,%%ymm4          \n"
+      "vmovdqu     %%ymm4,(%1)                   \n"  // store above
+      "vpackuswb   %%ymm2,%%ymm5,%%ymm5          \n"
+      "vmovdqu     %%ymm5,(%1,%4)                \n"  // store below
+
+      "lea         0x10(%0),%0                   \n"
+      "lea         0x20(%1),%1                   \n"  // 8 uv to 16 uv
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),                // %0
+        "+r"(dst_ptr),                // %1
+        "+r"(dst_width)               // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "r"((intptr_t)(dst_stride)),  // %4
+        "m"(kUVLinearMadd31_AVX2)     // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index e260dc9..fea3e64 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -509,20 +509,19 @@
                              int dst_width) {
   const uint8_t* src_temp = src_ptr + 1;
   asm volatile(
-      "vmov.u16    q15, #3                       \n"
+      "vmov.u8     d30, #3                       \n"
 
       "1:                                        \n"
-      "vld1.8      {d0}, [%0]!                   \n"  // 01234567
-      "vld1.8      {d2}, [%3]!                   \n"  // 12345678
+      "vld1.8      {d4}, [%0]!                   \n"  // 01234567
+      "vld1.8      {d5}, [%3]!                   \n"  // 12345678
 
-      "vmovl.u8    q0, d0                        \n"  // 01234567 (16b)
-      "vmovl.u8    q1, d2                        \n"  // 12345678 (16b)
-      "vmovq       q2, q0                        \n"
-      "vmla.u16    q2, q1, q15                   \n"  // 3*near+far (odd)
-      "vmla.u16    q1, q0, q15                   \n"  // 3*near+far (even)
+      "vmovl.u8    q0, d4                        \n"  // 01234567 (16b)
+      "vmovl.u8    q1, d5                        \n"  // 12345678 (16b)
+      "vmlal.u8    q0, d5, d30                   \n"  // 3*near+far (odd)
+      "vmlal.u8    q1, d4, d30                   \n"  // 3*near+far (even)
 
-      "vrshrn.u16  d0, q1, #2                    \n"  // 3/4*near+1/4*far (odd)
-      "vrshrn.u16  d1, q2, #2                    \n"  // 3/4*near+1/4*far (even)
+      "vrshrn.u16  d1, q0, #2                    \n"  // 3/4*near+1/4*far (odd)
+      "vrshrn.u16  d0, q1, #2                    \n"  // 3/4*near+1/4*far (even)
 
       "vst2.8      {d0, d1}, [%1]!               \n"  // store
       "subs        %2, %2, #16                   \n"  // 8 sample -> 16 sample
@@ -548,25 +547,24 @@
 
   asm volatile(
       "vmov.u16    q15, #3                       \n"
+      "vmov.u8     d28, #3                       \n"
 
       "1:                                        \n"
-      "vld1.8      {d0}, [%0]!                   \n"  // 01234567
-      "vld1.8      {d2}, [%5]!                   \n"  // 12345678
+      "vld1.8      {d4}, [%0]!                   \n"  // 01234567
+      "vld1.8      {d5}, [%5]!                   \n"  // 12345678
 
-      "vmovl.u8    q0, d0                        \n"  // 01234567 (16b)
-      "vmovl.u8    q1, d2                        \n"  // 12345678 (16b)
-      "vmovq       q2, q0                        \n"
-      "vmla.u16    q0, q1, q15                   \n"  // 3*near+far (1, odd)
-      "vmla.u16    q1, q2, q15                   \n"  // 3*near+far (1, even)
+      "vmovl.u8    q0, d4                        \n"  // 01234567 (16b)
+      "vmovl.u8    q1, d5                        \n"  // 12345678 (16b)
+      "vmlal.u8    q0, d5, d28                   \n"  // 3*near+far (1, odd)
+      "vmlal.u8    q1, d4, d28                   \n"  // 3*near+far (1, even)
 
-      "vld1.8      {d4}, [%1]!                   \n"  // 01234567
-      "vld1.8      {d6}, [%6]!                   \n"  // 12345678
+      "vld1.8      {d8}, [%1]!                   \n"
+      "vld1.8      {d9}, [%6]!                   \n"
 
-      "vmovl.u8    q2, d4                        \n"  // 01234567 (16b)
-      "vmovl.u8    q3, d6                        \n"  // 12345678 (16b)
-      "vmovq       q4, q2                        \n"
-      "vmla.u16    q2, q3, q15                   \n"  // 3*near+far (2, odd)
-      "vmla.u16    q3, q4, q15                   \n"  // 3*near+far (2, even)
+      "vmovl.u8    q2, d8                        \n"
+      "vmovl.u8    q3, d9                        \n"
+      "vmlal.u8    q2, d9, d28                   \n"  // 3*near+far (2, odd)
+      "vmlal.u8    q3, d8, d28                   \n"  // 3*near+far (2, even)
 
       // e  o
       // q1 q0
@@ -600,7 +598,7 @@
         "+r"(src_temp),   // %5
         "+r"(src_temp1)   // %6
       :
-      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
         "q15"  // Clobber List
   );
 }
@@ -694,6 +692,105 @@
   );
 }
 
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  const uint8_t* src_temp = src_ptr + 2;
+  asm volatile(
+      "vmov.u8     d30, #3                       \n"
+
+      "1:                                        \n"
+      "vld1.8      {d4}, [%0]!                   \n"  // 00112233 (1u1v)
+      "vld1.8      {d5}, [%3]!                   \n"  // 11223344 (1u1v)
+
+      "vmovl.u8    q0, d4                        \n"  // 00112233 (1u1v, 16b)
+      "vmovl.u8    q1, d5                        \n"  // 11223344 (1u1v, 16b)
+      "vmlal.u8    q0, d5, d30                   \n"  // 3*near+far (odd)
+      "vmlal.u8    q1, d4, d30                   \n"  // 3*near+far (even)
+
+      "vrshrn.u16  d1, q0, #2                    \n"  // 3/4*near+1/4*far (odd)
+      "vrshrn.u16  d0, q1, #2                    \n"  // 3/4*near+1/4*far (even)
+
+      "vst2.16     {d0, d1}, [%1]!               \n"  // store
+      "subs        %2, %2, #8                    \n"  // 4 uv -> 8 uv
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_temp)    // %3
+      :
+      : "memory", "cc", "q0", "q1", "q2", "d30"  // Clobber List
+  );
+}
+
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 ptrdiff_t dst_stride,
+                                 int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+  const uint8_t* src_temp = src_ptr + 2;
+  const uint8_t* src_temp1 = src_ptr1 + 2;
+
+  asm volatile(
+      "vmov.u16    q15, #3                       \n"
+      "vmov.u8     d28, #3                       \n"
+
+      "1:                                        \n"
+      "vld1.8      {d4}, [%0]!                   \n"  // 00112233 (1u1v)
+      "vld1.8      {d5}, [%5]!                   \n"  // 11223344 (1u1v)
+
+      "vmovl.u8    q0, d4                        \n"  // 00112233 (1u1v, 16b)
+      "vmovl.u8    q1, d5                        \n"  // 11223344 (1u1v, 16b)
+      "vmlal.u8    q0, d5, d28                   \n"  // 3*near+far (1, odd)
+      "vmlal.u8    q1, d4, d28                   \n"  // 3*near+far (1, even)
+
+      "vld1.8      {d8}, [%1]!                   \n"  // 00112233 (1u1v)
+      "vld1.8      {d9}, [%6]!                   \n"  // 11223344 (1u1v)
+
+      "vmovl.u8    q2, d8                        \n"  // 00112233 (1u1v, 16b)
+      "vmovl.u8    q3, d9                        \n"  // 11223344 (1u1v, 16b)
+      "vmlal.u8    q2, d9, d28                   \n"  // 3*near+far (2, odd)
+      "vmlal.u8    q3, d8, d28                   \n"  // 3*near+far (2, even)
+
+      // e  o
+      // q1 q0
+      // q3 q2
+
+      "vmovq       q4, q2                        \n"
+      "vmovq       q5, q3                        \n"
+      "vmla.u16    q4, q0, q15                   \n"  // 9 3 3 1 (1, odd)
+      "vmla.u16    q5, q1, q15                   \n"  // 9 3 3 1 (1, even)
+      "vmla.u16    q0, q2, q15                   \n"  // 9 3 3 1 (2, odd)
+      "vmla.u16    q1, q3, q15                   \n"  // 9 3 3 1 (2, even)
+
+      // e  o
+      // q5 q4
+      // q1 q0
+
+      "vrshrn.u16  d2, q1, #4                    \n"  // 2, even
+      "vrshrn.u16  d3, q0, #4                    \n"  // 2, odd
+      "vrshrn.u16  d0, q5, #4                    \n"  // 1, even
+      "vrshrn.u16  d1, q4, #4                    \n"  // 1, odd
+
+      "vst2.16     {d0, d1}, [%2]!               \n"  // store
+      "vst2.16     {d2, d3}, [%3]!               \n"  // store
+      "subs        %4, %4, #8                    \n"  // 4 uv -> 8 uv
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(src_ptr1),   // %1
+        "+r"(dst_ptr),    // %2
+        "+r"(dst_ptr1),   // %3
+        "+r"(dst_width),  // %4
+        "+r"(src_temp),   // %5
+        "+r"(src_temp1)   // %6
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
+        "q15"  // Clobber List
+  );
+}
+
 // Add a row of bytes to a row of shorts.  Used for box filter.
 // Reads 16 bytes and accumulates to 16 shorts at a time.
 void ScaleAddRow_NEON(const uint8_t* src_ptr,
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index 4b4f2fb..3a3d499 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -721,6 +721,101 @@
   );
 }
 
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  const uint8_t* src_temp = src_ptr + 2;
+  asm volatile(
+      "movi        v31.8b, #3                    \n"
+
+      "1:                                        \n"
+      "ldr         d0, [%0], #8                  \n"  // 00112233 (1u1v)
+      "ldr         d1, [%1], #8                  \n"  // 11223344 (1u1v)
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+
+      "ushll       v2.8h, v0.8b, #0              \n"  // 00112233 (1u1v, 16b)
+      "ushll       v3.8h, v1.8b, #0              \n"  // 11223344 (1u1v, 16b)
+
+      "umlal       v2.8h, v1.8b, v31.8b          \n"  // 3*near+far (odd)
+      "umlal       v3.8h, v0.8b, v31.8b          \n"  // 3*near+far (even)
+
+      "rshrn       v2.8b, v2.8h, #2              \n"  // 3/4*near+1/4*far (odd)
+      "rshrn       v1.8b, v3.8h, #2              \n"  // 3/4*near+1/4*far (even)
+
+      "st2         {v1.4h, v2.4h}, [%2], #16     \n"  // store
+      "subs        %w3, %w3, #8                  \n"  // 4 uv -> 8 uv
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(src_temp),  // %1
+        "+r"(dst_ptr),   // %2
+        "+r"(dst_width)  // %3
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v31"  // Clobber List
+  );
+}
+
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 ptrdiff_t dst_stride,
+                                 int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+  const uint8_t* src_temp = src_ptr + 2;
+  const uint8_t* src_temp1 = src_ptr1 + 2;
+
+  asm volatile(
+      "movi        v31.8b, #3                    \n"
+      "movi        v30.8h, #3                    \n"
+
+      "1:                                        \n"
+      "ldr         d0, [%0], #8                  \n"
+      "ldr         d1, [%2], #8                  \n"
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+
+      "ushll       v2.8h, v0.8b, #0              \n"
+      "ushll       v3.8h, v1.8b, #0              \n"
+      "umlal       v2.8h, v1.8b, v31.8b          \n"  // 3*near+far (1, odd)
+      "umlal       v3.8h, v0.8b, v31.8b          \n"  // 3*near+far (1, even)
+
+      "ldr         d0, [%1], #8                  \n"
+      "ldr         d1, [%3], #8                  \n"
+      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
+
+      "ushll       v4.8h, v0.8b, #0              \n"
+      "ushll       v5.8h, v1.8b, #0              \n"
+      "umlal       v4.8h, v1.8b, v31.8b          \n"  // 3*near+far (2, odd)
+      "umlal       v5.8h, v0.8b, v31.8b          \n"  // 3*near+far (2, even)
+
+      "mov         v0.8h, v4.8h                  \n"
+      "mov         v1.8h, v5.8h                  \n"
+      "mla         v4.8h, v2.8h, v30.8h          \n"  // 9 3 3 1 (1, odd)
+      "mla         v5.8h, v3.8h, v30.8h          \n"  // 9 3 3 1 (1, even)
+      "mla         v2.8h, v0.8h, v30.8h          \n"  // 9 3 3 1 (2, odd)
+      "mla         v3.8h, v1.8h, v30.8h          \n"  // 9 3 3 1 (2, even)
+
+      "rshrn       v2.8b, v2.8h, #4              \n"  // 2, odd
+      "rshrn       v1.8b, v3.8h, #4              \n"  // 2, even
+      "rshrn       v4.8b, v4.8h, #4              \n"  // 1, odd
+      "rshrn       v3.8b, v5.8h, #4              \n"  // 1, even
+
+      "st2         {v1.4h, v2.4h}, [%5], #16     \n"  // store 1
+      "st2         {v3.4h, v4.4h}, [%4], #16     \n"  // store 2
+      "subs        %w6, %w6, #8                  \n"  // 4 uv -> 8 uv
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(src_ptr1),   // %1
+        "+r"(src_temp),   // %2
+        "+r"(src_temp1),  // %3
+        "+r"(dst_ptr),    // %4
+        "+r"(dst_ptr1),   // %5
+        "+r"(dst_width)   // %6
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+        "v31"  // Clobber List
+  );
+}
+
 // Add a row of bytes to a row of shorts.  Used for box filter.
 // Reads 16 bytes and accumulates to 16 shorts at a time.
 void ScaleAddRow_NEON(const uint8_t* src_ptr,
diff --git a/source/scale_uv.cc b/source/scale_uv.cc
index 2235eeb..ab58966 100644
--- a/source/scale_uv.cc
+++ b/source/scale_uv.cc
@@ -649,6 +649,116 @@
 }
 #endif  // HAS_SCALEUVBILINEARUP
 
+// Scale UV, horizontally up by 2 times.
+// Uses linear filter horizontally, nearest vertically.
+// This is an optimized version for scaling up a plane to twice its
+// original width, using linear interpolation.
+// This is used to scale the interleaved UV plane of NV16 to NV24.
+void ScaleUVLinearUp2(int src_width,
+                      int src_height,
+                      int dst_width,
+                      int dst_height,
+                      int src_stride,
+                      int dst_stride,
+                      const uint8_t* src_uv,
+                      uint8_t* dst_uv) {
+  void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) =
+      ScaleUVRowUp2_Linear_Any_C;
+  int i;
+  int y;
+  int dy;
+
+  // This function can only scale up by 2 times horizontally.
+  assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3;
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2;
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_NEON
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON;
+  }
+#endif
+
+  if (dst_height == 1) {
+    ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv,
+               dst_width);
+  } else {
+    dy = FixedDiv(src_height - 1, dst_height - 1);
+    y = (1 << 15) - 1;
+    for (i = 0; i < dst_height; ++i) {
+      ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width);
+      dst_uv += dst_stride;
+      y += dy;
+    }
+  }
+}
+
+// Scale plane, up by 2 times.
+// This is an optimized version for scaling up a plane to twice its
+// original size, using bilinear interpolation.
+// This is used to scale the interleaved UV plane of NV12 to NV24.
+void ScaleUVBilinearUp2(int src_width,
+                        int src_height,
+                        int dst_width,
+                        int dst_height,
+                        int src_stride,
+                        int dst_stride,
+                        const uint8_t* src_ptr,
+                        uint8_t* dst_ptr) {
+  void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                      uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+      ScaleUVRowUp2_Bilinear_Any_C;
+  int x;
+
+  // This function can only scale up by 2 times.
+  assert(src_width == ((dst_width + 1) / 2));
+  assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3;
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2;
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON
+  if (TestCpuFlag(kCpuHasNEON)) {
+    Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON;
+  }
+#endif
+
+  if (src_height == 1) {
+    Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
+  } else {
+    Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+    dst_ptr += dst_stride;
+    for (x = 0; x < src_height - 1; ++x) {
+      Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+      src_ptr += src_stride;
+      // TODO: Test performance of writing one row of destination at a time.
+      dst_ptr += 2 * dst_stride;
+    }
+    if (!(dst_height & 1)) {
+      Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+    }
+  }
+}
+
 // Scale UV to/from any dimensions, without interpolation.
 // Fixed point math is used for performance: The upper 16 bits
 // of x and dx is the integer part of the source position and
@@ -844,6 +954,19 @@
                        dst_stride, src, dst, x, y, dy, 4, filtering);
     return;
   }
+  if (filtering && src_height == dst_height) {
+    ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride,
+                     dst_stride, src, dst);
+    return;
+  }
+  if ((clip_height + 1) / 2 == src_height &&
+      (clip_width + 1) / 2 == src_width &&
+      (filtering == kFilterBilinear || filtering == kFilterBox)) {
+    ScaleUVBilinearUp2(src_width, src_height, clip_width, clip_height,
+                       src_stride, dst_stride, src, dst);
+    return;
+  }
 #if HAS_SCALEUVBILINEARUP
   if (filtering && dy < 65536) {
     ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 2070320..c7c5daf 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -458,6 +458,8 @@
 
 TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
 TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
+TESTBIPLANARTOBP(NV12, 2, 2, NV24, 1, 1)
+TESTBIPLANARTOBP(NV16, 2, 1, NV24, 1, 1)
 
 #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,         \
                          FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \