Add NV12ToNV24 and NV16ToNV24
These are bi-planar conversion functions that scale the UV plane up to the Y plane's size using a (bi)linear filter.
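A usage sketch, assuming tightly packed planes; FrameToNV24 is an
illustrative helper, not part of this change:

  #include "libyuv/convert.h"

  // Convert a tightly packed NV12 frame into a caller-allocated NV24
  // frame. NV24 keeps the interleaved UV layout but at full resolution,
  // so its UV stride is width * 2.
  int FrameToNV24(const uint8_t* src_y, const uint8_t* src_uv,
                  uint8_t* dst_y, uint8_t* dst_uv, int width, int height) {
    int src_stride_uv = ((width + 1) / 2) * 2;  // half-width UV pairs
    return libyuv::NV12ToNV24(src_y, width, src_uv, src_stride_uv,
                              dst_y, width, dst_uv, width * 2,
                              width, height);
  }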
libyuv_unittest --gtest_filter=*ToNV24*
R=fbarchard@chromium.org
Change-Id: I3d98f833feeef00af3c903ac9ad0e41bdcbcb51f
Bug: libyuv:872
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2682152
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
diff --git a/README.chromium b/README.chromium
index bdd05f1..b96e823 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1775
+Version: 1776
License: BSD
License File: LICENSE
diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h
index 137b30f..7322300 100644
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -289,6 +289,32 @@
int width,
int height);
+// Convert NV12 to NV24.
+LIBYUV_API
+int NV12ToNV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert NV16 to NV24.
+LIBYUV_API
+int NV16ToNV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8_t* src_yuy2,
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index ee77d22..92759b2 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -77,12 +77,14 @@
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_SCALEUVROWDOWN2BOX_SSSE3
-#define HAS_SCALECOLUP2LINEAR_SSE2
-#define HAS_SCALECOLUP2LINEAR_SSSE3
#define HAS_SCALEROWUP2LINEAR_SSE2
#define HAS_SCALEROWUP2LINEAR_SSSE3
-#define HAS_SCALECOLUP2LINEAR_16_SSE2
+#define HAS_SCALEROWUP2BILINEAR_SSE2
+#define HAS_SCALEROWUP2BILINEAR_SSSE3
#define HAS_SCALEROWUP2LINEAR_16_SSE2
+#define HAS_SCALEROWUP2BILINEAR_16_SSE2
+#define HAS_SCALEUVROWUP2LINEAR_SSSE3
+#define HAS_SCALEUVROWUP2BILINEAR_SSSE3
#endif
// The following are available for gcc/clang x86 platforms, but
@@ -92,10 +94,12 @@
(defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_SCALEUVROWDOWN2BOX_AVX2
-#define HAS_SCALECOLUP2LINEAR_AVX2
#define HAS_SCALEROWUP2LINEAR_AVX2
-#define HAS_SCALECOLUP2LINEAR_16_AVX2
+#define HAS_SCALEROWUP2BILINEAR_AVX2
#define HAS_SCALEROWUP2LINEAR_16_AVX2
+#define HAS_SCALEROWUP2BILINEAR_16_AVX2
+#define HAS_SCALEUVROWUP2LINEAR_AVX2
+#define HAS_SCALEUVROWUP2BILINEAR_AVX2
#endif
// The following are available on all x86 platforms, but
@@ -124,10 +128,12 @@
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEUVROWDOWN2BOX_NEON
#define HAS_SCALEUVROWDOWNEVEN_NEON
-#define HAS_SCALECOLUP2LINEAR_NEON
#define HAS_SCALEROWUP2LINEAR_NEON
-#define HAS_SCALECOLUP2LINEAR_16_NEON
+#define HAS_SCALEROWUP2BILINEAR_NEON
#define HAS_SCALEROWUP2LINEAR_16_NEON
+#define HAS_SCALEROWUP2BILINEAR_16_NEON
+#define HAS_SCALEUVROWUP2LINEAR_NEON
+#define HAS_SCALEUVROWUP2BILINEAR_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -464,6 +470,24 @@
int src_stepx,
uint8_t* dst_uv,
int dst_width);
+
+void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_Any_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+
void ScaleUVCols_C(uint8_t* dst_uv,
const uint8_t* src_uv,
int dst_width,
@@ -1163,6 +1187,55 @@
uint8_t* dst_ptr,
int dst_width);
+void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+void ScaleUVRowUp2_Linear_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width);
+
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index a57dfa5..6073df8 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1775
+#define LIBYUV_VERSION 1776
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert.cc b/source/convert.cc
index 98258b9..8a4fcf0 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -16,6 +16,7 @@
#include "libyuv/rotate.h"
#include "libyuv/row.h"
#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/scale_uv.h" // For UVScale()
#ifdef __cplusplus
namespace libyuv {
@@ -613,6 +614,55 @@
width, height);
}
+LIBYUV_API
+int NV12ToNV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (width == 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ }
+ UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1),
+ SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width),
+ Abs(height), kFilterBilinear);
+ return 0;
+}
+
+LIBYUV_API
+int NV16ToNV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (width == 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ }
+ UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv,
+ dst_stride_uv, Abs(width), Abs(height), kFilterBilinear);
+ return 0;
+}
+
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8_t* src_yuy2,
diff --git a/source/scale.cc b/source/scale.cc
index 16771cd..226024c 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1415,27 +1415,27 @@
// This function can only scale up by 2 times.
assert(src_width == ((dst_width + 1) / 2));
- assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
+ assert(src_height == ((dst_height + 1) / 2));
-#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
}
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
+#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
}
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+#ifdef HAS_SCALEROWUP2BILINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
}
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_NEON
+#ifdef HAS_SCALEROWUP2BILINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
}
@@ -1480,19 +1480,19 @@
// This function can only scale up by 2 times horizontally.
assert(src_width == ((dst_width + 1) / 2));
-#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
}
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2;
}
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_NEON
+#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON;
}
@@ -1532,21 +1532,21 @@
// This function can only scale up by 2 times.
assert(src_width == ((dst_width + 1) / 2));
- assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
+ assert(src_height == ((dst_height + 1) / 2));
-#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2;
}
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2;
}
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_NEON
+#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON;
}
diff --git a/source/scale_any.cc b/source/scale_any.cc
index 7939498..4257d17 100644
--- a/source/scale_any.cc
+++ b/source/scale_any.cc
@@ -640,7 +640,7 @@
0,
uint16_t)
-#ifdef HAS_SCALECOLUP2LINEAR_SSE2
+#ifdef HAS_SCALEROWUP2LINEAR_SSE2
SUH2LANY(ScaleRowUp2_Linear_Any_SSE2,
ScaleRowUp2_Linear_SSE2,
ScaleRowUp2_Linear_C,
@@ -648,7 +648,7 @@
uint8_t)
#endif
-#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
+#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
ScaleRowUp2_Linear_SSSE3,
ScaleRowUp2_Linear_C,
@@ -656,7 +656,7 @@
uint8_t)
#endif
-#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
+#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
ScaleRowUp2_Linear_16_SSE2,
ScaleRowUp2_Linear_16_C,
@@ -664,7 +664,7 @@
uint16_t)
#endif
-#ifdef HAS_SCALECOLUP2LINEAR_AVX2
+#ifdef HAS_SCALEROWUP2LINEAR_AVX2
SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
ScaleRowUp2_Linear_AVX2,
ScaleRowUp2_Linear_C,
@@ -672,7 +672,7 @@
uint8_t)
#endif
-#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
+#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
ScaleRowUp2_Linear_16_AVX2,
ScaleRowUp2_Linear_16_C,
@@ -680,7 +680,7 @@
uint16_t)
#endif
-#ifdef HAS_SCALECOLUP2LINEAR_NEON
+#ifdef HAS_SCALEROWUP2LINEAR_NEON
SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
ScaleRowUp2_Linear_NEON,
ScaleRowUp2_Linear_C,
@@ -688,7 +688,7 @@
uint8_t)
#endif
-#ifdef HAS_SCALECOLUP2LINEAR_16_NEON
+#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
ScaleRowUp2_Linear_16_NEON,
ScaleRowUp2_Linear_16_C,
@@ -699,7 +699,7 @@
#undef SUH2LANY
// Scale up 2 times using bilinear filter.
-// This function produces 2 rows at a time
+// This function produces 2 rows at a time.
#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
ptrdiff_t dst_stride, int dst_width) { \
@@ -736,7 +736,7 @@
0,
uint16_t)
-#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
ScaleRowUp2_Bilinear_SSE2,
ScaleRowUp2_Bilinear_C,
@@ -744,7 +744,7 @@
uint8_t)
#endif
-#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
+#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
ScaleRowUp2_Bilinear_16_SSE2,
ScaleRowUp2_Bilinear_16_C,
@@ -752,7 +752,7 @@
uint16_t)
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
+#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3,
ScaleRowUp2_Bilinear_SSSE3,
ScaleRowUp2_Bilinear_C,
@@ -760,7 +760,7 @@
uint8_t)
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+#ifdef HAS_SCALEROWUP2BILINEAR_AVX2
SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2,
ScaleRowUp2_Bilinear_AVX2,
ScaleRowUp2_Bilinear_C,
@@ -768,7 +768,7 @@
uint8_t)
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
+#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2,
ScaleRowUp2_Bilinear_16_AVX2,
ScaleRowUp2_Bilinear_16_C,
@@ -776,7 +776,7 @@
uint16_t)
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_NEON
+#ifdef HAS_SCALEROWUP2BILINEAR_NEON
SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON,
ScaleRowUp2_Bilinear_NEON,
ScaleRowUp2_Bilinear_C,
@@ -784,7 +784,7 @@
uint8_t)
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
+#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON,
ScaleRowUp2_Bilinear_16_NEON,
ScaleRowUp2_Bilinear_16_C,
@@ -794,6 +794,120 @@
#undef SU2BLANY
+// Scale bi-planar plane up horizontally 2 times using linear filter.
+#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ dst_ptr[0] = src_ptr[0]; \
+ dst_ptr[1] = src_ptr[1]; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(src_ptr, dst_ptr + 2, n); \
+ } \
+ C(src_ptr + n, dst_ptr + 2 * n + 2, r); \
+ } \
+ dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \
+ dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \
+ }
+
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_C,
+ ScaleUVRowUp2_Linear_C,
+ ScaleUVRowUp2_Linear_C,
+ 0,
+ uint8_t)
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3,
+ ScaleUVRowUp2_Linear_SSSE3,
+ ScaleUVRowUp2_Linear_C,
+ 7,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2,
+ ScaleUVRowUp2_Linear_AVX2,
+ ScaleUVRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_NEON
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
+ ScaleUVRowUp2_Linear_NEON,
+ ScaleUVRowUp2_Linear_C,
+ 7,
+ uint8_t)
+#endif
+
+#undef SBUH2LANY
+
+// Scale bi-planar plane up 2 times using bilinear filter.
+// This function produces 2 rows at a time.
+#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
+ ptrdiff_t dst_stride, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ const PTYPE* sa = src_ptr; \
+ const PTYPE* sb = src_ptr + src_stride; \
+ PTYPE* da = dst_ptr; \
+ PTYPE* db = dst_ptr + dst_stride; \
+    da[0] = (3 * sa[0] + sb[0] + 2) >> 2;                                \
+    db[0] = (sa[0] + 3 * sb[0] + 2) >> 2;                                \
+    da[1] = (3 * sa[1] + sb[1] + 2) >> 2;                                \
+    db[1] = (sa[1] + 3 * sb[1] + 2) >> 2;                                \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(sa, sb - sa, da + 2, db - da, n); \
+ } \
+ C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \
+ } \
+    da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] +        \
+                             sb[((dst_width + 1) & ~1) - 2] + 2) >> 2;   \
+    db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] +            \
+                             3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> 2; \
+    da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] +        \
+                             sb[((dst_width + 1) & ~1) - 1] + 2) >> 2;   \
+    db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] +            \
+                             3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> 2; \
+ }
+
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
+ ScaleUVRowUp2_Bilinear_C,
+ ScaleUVRowUp2_Bilinear_C,
+ 0,
+ uint8_t)
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3,
+ ScaleUVRowUp2_Bilinear_SSSE3,
+ ScaleUVRowUp2_Bilinear_C,
+ 7,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2,
+ ScaleUVRowUp2_Bilinear_AVX2,
+ ScaleUVRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
+ ScaleUVRowUp2_Bilinear_NEON,
+ ScaleUVRowUp2_Bilinear_C,
+ 7,
+ uint8_t)
+#endif
+
+#undef SBU2BLANY
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/source/scale_common.cc b/source/scale_common.cc
index 8d41c03..4af8432 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -1200,6 +1200,56 @@
}
}
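+// Horizontally upsample an interleaved UV row by 2x with the linear kernel
+// (3 * near + far + 2) >> 2; the U and V bytes are filtered independently.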
+void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ dst_ptr[4 * x + 0] =
+ (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2;
+ dst_ptr[4 * x + 1] =
+ (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2;
+ dst_ptr[4 * x + 2] =
+ (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2;
+ dst_ptr[4 * x + 3] =
+ (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2;
+ }
+}
+
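+// Upsample an interleaved UV plane by 2x2 with the bilinear kernel: weights
+// 9/3/3/1 (nearest source pixel weighted 9), rounded via (sum + 8) >> 4.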
+void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ uint8_t* d = dst_ptr;
+ uint8_t* e = dst_ptr + dst_stride;
+ int src_width = dst_width >> 1;
+ int x;
+ assert((dst_width % 2 == 0) && (dst_width >= 0));
+ for (x = 0; x < src_width; ++x) {
+ d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
+ t[2 * x + 2] * 1 + 8) >> 4;
+ d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
+ t[2 * x + 3] * 1 + 8) >> 4;
+ d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
+ t[2 * x + 2] * 3 + 8) >> 4;
+ d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
+ t[2 * x + 3] * 3 + 8) >> 4;
+ e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
+ t[2 * x + 2] * 3 + 8) >> 4;
+ e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
+ t[2 * x + 3] * 3 + 8) >> 4;
+ e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
+ t[2 * x + 2] * 9 + 8) >> 4;
+ e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
+ t[2 * x + 3] * 9 + 8) >> 4;
+ }
+}
+
// Scales a single row of pixels using point sampling.
void ScaleUVCols_C(uint8_t* dst_uv,
const uint8_t* src_uv,
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
index db3c968..226e0a9 100644
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -779,7 +779,7 @@
"xmm7");
}
-#ifdef HAS_SCALECOLUP2LINEAR_SSE2
+#ifdef HAS_SCALEROWUP2LINEAR_SSE2
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
@@ -833,7 +833,7 @@
}
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_SSE2
+#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -949,7 +949,7 @@
}
#endif
-#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
+#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
@@ -999,7 +999,7 @@
}
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
+#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
@@ -1106,7 +1106,7 @@
}
#endif
-#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
+#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
static const uvec8 kLinearMadd31_SSSE3 = {3, 1, 1, 3, 3, 1, 1, 3,
3, 1, 1, 3, 3, 1, 1, 3};
@@ -1149,7 +1149,7 @@
}
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
+#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -1236,7 +1236,7 @@
}
#endif
-#ifdef HAS_SCALECOLUP2LINEAR_AVX2
+#ifdef HAS_SCALEROWUP2LINEAR_AVX2
static const lvec8 kLinearMadd31_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1,
3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1,
1, 3, 3, 1, 1, 3, 3, 1, 1, 3};
@@ -1281,7 +1281,7 @@
}
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_AVX2
+#ifdef HAS_SCALEROWUP2BILINEAR_AVX2
void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -1364,7 +1364,7 @@
}
#endif
-#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
+#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
static const lvec16 kLinearMadd31_16_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3,
3, 1, 1, 3, 3, 1, 1, 3};
@@ -1450,7 +1450,7 @@
}
#endif
-#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
+#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
@@ -2261,6 +2261,257 @@
}
#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
+#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
+static const uvec8 kUVLinearMadd31_SSSE3 = {3, 1, 3, 1, 1, 3, 1, 3,
+ 3, 1, 3, 1, 1, 3, 1, 3};
+void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm4,%%xmm4 \n"
+ "psrlw $15,%%xmm4 \n"
+ "psllw $1,%%xmm4 \n" // all 2
+ "movdqu %3,%%xmm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 00112233 (1u1v)
+ "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
+ "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
+ "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
+ "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi)
+ "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo)
+ "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
+      "packuswb %%xmm2,%%xmm0 \n"
+      "movdqu %%xmm0,(%1) \n"
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 4 uv to 8 uv
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kUVLinearMadd31_SSSE3) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
+void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm6,%%xmm6 \n"
+ "psrlw $15,%%xmm6 \n"
+ "psllw $3,%%xmm6 \n" // all 8
+ "movdqu %5,%%xmm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 00112233 (1u1v)
+ "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
+ "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
+ "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
+ "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi)
+ "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo)
+
+ "movq (%0,%3),%%xmm1 \n"
+ "movq 2(%0,%3),%%xmm4 \n"
+ "punpcklbw %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhdq %%xmm1,%%xmm3 \n"
+ "punpckldq %%xmm1,%%xmm1 \n"
+ "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
+ "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
+
+ // xmm0 xmm2
+ // xmm1 xmm3
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
+
+ "packuswb %%xmm0,%%xmm4 \n"
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packuswb %%xmm1,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4) \n" // store below
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 4 uv to 8 uv
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kUVLinearMadd31_SSSE3) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
+static const lvec8 kUVLinearMadd31_AVX2 = {3, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3,
+ 1, 1, 3, 1, 3, 3, 1, 3, 1, 1, 3,
+ 1, 3, 3, 1, 3, 1, 1, 3, 1, 3};
+
+void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+ "vmovdqu %3,%%ymm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n"
+ "vmovdqu 2(%0),%%xmm1 \n"
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 uv to 16 uv
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kUVLinearMadd31_AVX2) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
+void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrlw $15,%%ymm6,%%ymm6 \n"
+ "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
+ "vmovdqu %5,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n"
+ "vmovdqu 2(%0),%%xmm1 \n"
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
+
+ "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
+ "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
+ "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
+ "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n"
+ "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
+
+ // ymm0 ymm1
+ // ymm2 ymm3
+
+ "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 uv to 16 uv
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kUVLinearMadd31_AVX2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index e260dc9..fea3e64 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -509,20 +509,19 @@
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
- "vmov.u16 q15, #3 \n"
+ "vmov.u8 d30, #3 \n"
"1: \n"
- "vld1.8 {d0}, [%0]! \n" // 01234567
- "vld1.8 {d2}, [%3]! \n" // 12345678
+ "vld1.8 {d4}, [%0]! \n" // 01234567
+ "vld1.8 {d5}, [%3]! \n" // 12345678
- "vmovl.u8 q0, d0 \n" // 01234567 (16b)
- "vmovl.u8 q1, d2 \n" // 12345678 (16b)
- "vmovq q2, q0 \n"
- "vmla.u16 q2, q1, q15 \n" // 3*near+far (odd)
- "vmla.u16 q1, q0, q15 \n" // 3*near+far (even)
+ "vmovl.u8 q0, d4 \n" // 01234567 (16b)
+ "vmovl.u8 q1, d5 \n" // 12345678 (16b)
+ "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd)
+ "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even)
- "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (odd)
- "vrshrn.u16 d1, q2, #2 \n" // 3/4*near+1/4*far (even)
+ "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even)
"vst2.8 {d0, d1}, [%1]! \n" // store
"subs %2, %2, #16 \n" // 8 sample -> 16 sample
@@ -548,25 +547,24 @@
asm volatile(
"vmov.u16 q15, #3 \n"
+ "vmov.u8 d28, #3 \n"
"1: \n"
- "vld1.8 {d0}, [%0]! \n" // 01234567
- "vld1.8 {d2}, [%5]! \n" // 12345678
+ "vld1.8 {d4}, [%0]! \n" // 01234567
+ "vld1.8 {d5}, [%5]! \n" // 12345678
- "vmovl.u8 q0, d0 \n" // 01234567 (16b)
- "vmovl.u8 q1, d2 \n" // 12345678 (16b)
- "vmovq q2, q0 \n"
- "vmla.u16 q0, q1, q15 \n" // 3*near+far (1, odd)
- "vmla.u16 q1, q2, q15 \n" // 3*near+far (1, even)
+ "vmovl.u8 q0, d4 \n" // 01234567 (16b)
+ "vmovl.u8 q1, d5 \n" // 12345678 (16b)
+ "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd)
+ "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even)
- "vld1.8 {d4}, [%1]! \n" // 01234567
- "vld1.8 {d6}, [%6]! \n" // 12345678
+ "vld1.8 {d8}, [%1]! \n"
+ "vld1.8 {d9}, [%6]! \n"
- "vmovl.u8 q2, d4 \n" // 01234567 (16b)
- "vmovl.u8 q3, d6 \n" // 12345678 (16b)
- "vmovq q4, q2 \n"
- "vmla.u16 q2, q3, q15 \n" // 3*near+far (2, odd)
- "vmla.u16 q3, q4, q15 \n" // 3*near+far (2, even)
+ "vmovl.u8 q2, d8 \n"
+ "vmovl.u8 q3, d9 \n"
+ "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd)
+ "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even)
// e o
// q1 q0
@@ -600,7 +598,7 @@
"+r"(src_temp), // %5
"+r"(src_temp1) // %6
:
- : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
"q15" // Clobber List
);
}
@@ -694,6 +692,105 @@
);
}
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_temp = src_ptr + 2;
+ asm volatile(
+ "vmov.u8 d30, #3 \n"
+
+ "1: \n"
+ "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v)
+ "vld1.8 {d5}, [%3]! \n" // 11223344 (1u1v)
+
+ "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b)
+ "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b)
+ "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd)
+ "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even)
+
+ "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even)
+
+ "vst2.16 {d0, d1}, [%1]! \n" // store
+ "subs %2, %2, #8 \n" // 4 uv -> 8 uv
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "d30" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint8_t* src_temp = src_ptr + 2;
+ const uint8_t* src_temp1 = src_ptr1 + 2;
+
+ asm volatile(
+ "vmov.u16 q15, #3 \n"
+ "vmov.u8 d28, #3 \n"
+
+ "1: \n"
+ "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v)
+ "vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v)
+
+ "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b)
+ "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b)
+ "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd)
+ "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even)
+
+ "vld1.8 {d8}, [%1]! \n" // 00112233 (1u1v)
+ "vld1.8 {d9}, [%6]! \n" // 11223344 (1u1v)
+
+ "vmovl.u8 q2, d8 \n" // 00112233 (1u1v, 16b)
+ "vmovl.u8 q3, d9 \n" // 11223344 (1u1v, 16b)
+ "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd)
+ "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even)
+
+ // e o
+ // q1 q0
+ // q3 q2
+
+ "vmovq q4, q2 \n"
+ "vmovq q5, q3 \n"
+ "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
+ "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
+ "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
+ "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
+
+ // e o
+ // q5 q4
+ // q1 q0
+
+ "vrshrn.u16 d2, q1, #4 \n" // 2, even
+ "vrshrn.u16 d3, q0, #4 \n" // 2, odd
+ "vrshrn.u16 d0, q5, #4 \n" // 1, even
+ "vrshrn.u16 d1, q4, #4 \n" // 1, odd
+
+ "vst2.16 {d0, d1}, [%2]! \n" // store
+ "vst2.16 {d2, d3}, [%3]! \n" // store
+ "subs %4, %4, #8 \n" // 4 uv -> 8 uv
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
+ "q15" // Clobber List
+ );
+}
+
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index 4b4f2fb..3a3d499 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -721,6 +721,101 @@
);
}
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_temp = src_ptr + 2;
+ asm volatile(
+ "movi v31.8b, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n" // 00112233 (1u1v)
+ "ldr d1, [%1], #8 \n" // 11223344 (1u1v)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n" // 00112233 (1u1v, 16b)
+ "ushll v3.8h, v1.8b, #0 \n" // 11223344 (1u1v, 16b)
+
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)
+
+ "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
+ "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)
+
+ "st2 {v1.4h, v2.4h}, [%2], #16 \n" // store
+ "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride;
+ const uint8_t* src_temp = src_ptr + 2;
+ const uint8_t* src_temp1 = src_ptr1 + 2;
+
+ asm volatile(
+ "movi v31.8b, #3 \n"
+ "movi v30.8h, #3 \n"
+
+ "1: \n"
+ "ldr d0, [%0], #8 \n"
+ "ldr d1, [%2], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n"
+ "ushll v3.8h, v1.8b, #0 \n"
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)
+
+ "ldr d0, [%1], #8 \n"
+ "ldr d1, [%3], #8 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v4.8h, v0.8b, #0 \n"
+ "ushll v5.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
+ "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)
+
+ "mov v0.8h, v4.8h \n"
+ "mov v1.8h, v5.8h \n"
+ "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
+ "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
+ "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
+ "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)
+
+ "rshrn v2.8b, v2.8h, #4 \n" // 2, odd
+ "rshrn v1.8b, v3.8h, #4 \n" // 2, even
+ "rshrn v4.8b, v4.8h, #4 \n" // 1, odd
+ "rshrn v3.8b, v5.8h, #4 \n" // 1, even
+
+      "st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 2
+      "st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 1
+ "subs %w6, %w6, #8 \n" // 4 uv -> 8 uv
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+ "v31" // Clobber List
+ );
+}
+
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
diff --git a/source/scale_uv.cc b/source/scale_uv.cc
index 2235eeb..ab58966 100644
--- a/source/scale_uv.cc
+++ b/source/scale_uv.cc
@@ -649,6 +649,116 @@
}
#endif // HAS_SCALEUVBILINEARUP
+// Scale UV plane horizontally up by 2 times.
+// Uses linear filter horizontally, nearest vertically.
+// This is an optimized version for scaling up a plane to 2 times its
+// original width, using linear interpolation.
+// This is used to scale the U and V planes of NV16 to NV24.
+void ScaleUVLinearUp2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv) {
+ void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) =
+ ScaleUVRowUp2_Linear_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width);
+ dst_uv += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+// Scale UV plane up by 2 times in both dimensions.
+// This is an optimized version for scaling up a plane to 2 times its
+// original size, using bilinear interpolation.
+// This is used to scale the U and V planes of NV12 to NV24.
+void ScaleUVBilinearUp2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleUVRowUp2_Bilinear_Any_C;
+ int x;
+
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON;
+ }
+#endif
+
+ if (src_height == 1) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
+ } else {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ // TODO: Test performance of writing one row of destination at a time.
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+ }
+}
+
// Scale UV to/from any dimensions, without interpolation.
// Fixed point math is used for performance: The upper 16 bits
// of x and dx is the integer part of the source position and
@@ -844,6 +954,18 @@
dst_stride, src, dst, x, y, dy, 4, filtering);
return;
}
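+  // Fast paths for upscaling the UV plane by exactly 2x, as used by
+  // NV16ToNV24 (horizontal only) and NV12ToNV24 (both axes).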
+  if (filtering && (clip_width + 1) / 2 == src_width &&
+      clip_height == src_height) {
+    ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height,
+                     src_stride, dst_stride, src, dst);
+    return;
+  }
+ if ((clip_height + 1) / 2 == src_height &&
+ (clip_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScaleUVBilinearUp2(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
#if HAS_SCALEUVBILINEARUP
if (filtering && dy < 65536) {
ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 2070320..c7c5daf 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -458,6 +458,8 @@
TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
+TESTBIPLANARTOBP(NV12, 2, 2, NV24, 1, 1)
+TESTBIPLANARTOBP(NV16, 2, 1, NV24, 1, 1)
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \