Convert8To16: use VPSRLW instead of VPMULHUW for better Lunar Lake performance
- llvm-mca reports block reciprocal throughput improves from 4.0 to 2.5
  cycles per loop (a scalar sketch of the equivalence is included below).
- Lunar Lake is the only CPU measured so far.

llvm-mca -mcpu=lunarlake, 100 iterations
Before (vpmulhuw):
Iterations: 100
Instructions: 1200
Total Cycles: 426
Total uOps: 1200
Dispatch Width: 8
uOps Per Cycle: 2.82
IPC: 2.82
Block RThroughput: 4.0

After (vpsrlw):
Iterations: 100
Instructions: 1200
Total Cycles: 279
Total uOps: 1400
Dispatch Width: 8
uOps Per Cycle: 5.02
IPC: 4.30
Block RThroughput: 2.5
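
A scalar sketch of the equivalence (illustration only, not part of the
change; helper names are made up). The new shift computation assumes scale
is a power of two, in which case the unsigned high multiply and a logical
right shift by 16 - log2(scale) produce identical results:

  #include <stdint.h>
  // Old path: widen the byte to v * 0x0101, then take the high 16 bits of
  // the product with scale (vpunpcklbw + vpmulhuw).
  static inline uint16_t To16MulHi(uint8_t v, unsigned scale) {
    uint16_t v16 = (uint16_t)(v * 0x0101);
    return (uint16_t)(((uint32_t)v16 * scale) >> 16);
  }
  // New path: same widening, then shift right by clz(scale) - 15, which is
  // 16 - log2(scale) for a power-of-two scale (vpunpcklbw + vpsrlw).
  static inline uint16_t To16Shift(uint8_t v, unsigned scale) {
    int shift = __builtin_clz(scale) - 15;
    uint16_t v16 = (uint16_t)(v * 0x0101);
    return (uint16_t)(v16 >> shift);
  }
  // e.g. scale = 1024 (8-bit -> 10-bit):
  // To16MulHi(255, 1024) == To16Shift(255, 1024) == 1023.
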
Bug: None
Change-Id: I5a49e1cf1ed3dfb59fe9861a871df9862417c6a6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6697745
Reviewed-by: richard winterton <rrwinterton@gmail.com>
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 6af2a1a..df0db47 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -5106,31 +5106,30 @@
uint16_t* dst_y,
int scale,
int width) {
- asm volatile(
- "vmovd %3,%%xmm2 \n"
- "vpbroadcastw %%xmm2,%%ymm2 \n"
+ const int shift = __builtin_clz(scale) - 15;
+ asm volatile("vmovd %3,%%xmm2 \n"
- // 32 pixels per loop.
- LABELALIGN
+ // 32 pixels per loop.
+ LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"add $0x20,%0 \n"
"vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
"vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpsrlw %%xmm2,%%ymm0,%%ymm0 \n"
+ "vpsrlw %%xmm2,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm1,0x20(%1) \n"
"add $0x40,%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
- : "+r"(src_y), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "r"(scale) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(shift) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_CONVERT8TO16ROW_AVX2
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 359cbf4..1f1a3bb 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -3963,7 +3963,7 @@
uint8_t* dst_y,
int scale,
int width) {
- int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
+ const int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
asm volatile(
"vdup.16 q2, %3 \n"
"1: \n"
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 9bef8c4..1f0d6e1 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -3983,8 +3983,8 @@
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
- int y1_fraction = source_y_fraction;
- int y0_fraction = 256 - y1_fraction;
+ const int y1_fraction = source_y_fraction;
+ const int y0_fraction = 256 - y1_fraction;
const uint8_t* src_ptr1 = src_ptr + src_stride;
asm volatile(
"cmp %w4, #0 \n"
@@ -4119,10 +4119,10 @@
int scale,
int dst_width,
int source_y_fraction) {
- int y1_fraction = source_y_fraction;
- int y0_fraction = 256 - y1_fraction;
+ const int y1_fraction = source_y_fraction;
+ const int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
- int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
+ const int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
asm volatile(
"dup v6.8h, %w6 \n"
@@ -5529,7 +5529,7 @@
// 15 - clz(scale), + 8 to shift result into the high half of the lane to
// saturate, then we can just use UZP2 to narrow rather than a pair of
// saturating narrow instructions.
- int shift = 23 - __builtin_clz((int32_t)scale);
+ const int shift = 23 - __builtin_clz((int32_t)scale);
asm volatile(
"dup v2.8h, %w3 \n"
"1: \n"
@@ -5591,7 +5591,7 @@
// (src * 0x0101 * scale) >> 16.
// Since scale is a power of two, compute the shift to use to avoid needing
// to widen to int32.
- int shift = 15 - __builtin_clz(scale);
+ const int shift = 15 - __builtin_clz(scale);
asm volatile(
"dup v2.8h, %w[shift] \n"
"1: \n"
diff --git a/source/row_sme.cc b/source/row_sme.cc
index a78f741..bd61b20 100644
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@@ -569,7 +569,7 @@
// 15 - clz(scale), + 8 to shift result into the high half of the lane to
// saturate, then we can just use UZP2 to narrow rather than a pair of
// saturating narrow instructions.
- int shift = 23 - __builtin_clz((int32_t)scale);
+ const int shift = 23 - __builtin_clz((int32_t)scale);
int vl;
asm volatile(
"cntb %x[vl] \n"
@@ -917,7 +917,7 @@
// 15 - clz(scale), + 8 to shift result into the high half of the lane to
// saturate, then we can just use UZP2 to narrow rather than a pair of
// saturating narrow instructions.
- int shift = 23 - __builtin_clz((int32_t)scale);
+ const int shift = 23 - __builtin_clz((int32_t)scale);
int vl;
asm volatile(
@@ -977,8 +977,8 @@
int scale,
int width,
int source_y_fraction) {
- int y1_fraction = source_y_fraction;
- int y0_fraction = 256 - y1_fraction;
+ const int y1_fraction = source_y_fraction;
+ const int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
// y0_fraction == 0 is never called here.
@@ -994,7 +994,7 @@
// 15 - clz(scale), + 8 to shift result into the high half of the lane to
// saturate, then we can just use UZP2 to narrow rather than a pair of
// saturating narrow instructions.
- int shift = 23 - __builtin_clz((int32_t)scale);
+ const int shift = 23 - __builtin_clz((int32_t)scale);
int vl;
asm volatile(
@@ -1085,7 +1085,7 @@
// (src * 0x0101 * scale) >> 16.
// Since scale is a power of two, compute the shift to use to avoid needing
// to widen to int32.
- int shift = __builtin_clz(scale) - 15;
+ const int shift = __builtin_clz(scale) - 15;
uint64_t vl;
asm volatile(
diff --git a/source/row_win.cc b/source/row_win.cc
index 5d4aec9..933efec 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -184,7 +184,7 @@
// 32 bit
#else // defined(_M_X64)
-// if HAS_ARGBTOUVROW_SSSE3
+// ifdef HAS_ARGBTOUVROW_SSSE3
// 8 bit fixed point 0.5, for bias of UV.
static const ulvec8 kBiasUV128 = {