ARGB scale 2x upsample with specialization for 25% / 75% row blends
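
Each specialized path dispatches on source_y_fraction (0..255, halved to 0..127 in the SSE paths): 0 copies the source row, and 32, 64 and 96 take the 75/25, 50/50 and 25/75 branches; anything else falls through to the general pmaddubsw row blend. The fast branches build on the rounding byte average (SSE2 pavgb, NEON vrhadd.u8): applied once it gives a 50/50 blend, and averaging that result against the same row a second time gives a quarter/three-quarter weighting, (a + 3b + 2) / 4 to within one LSB of compounded rounding. A minimal scalar sketch of that identity (AvgRound is an illustrative helper, not a libyuv function):

    #include <cstdint>
    #include <cstdio>

    // Scalar model of the rounding byte average (SSE2 pavgb, NEON vrhadd.u8).
    static inline uint8_t AvgRound(uint8_t a, uint8_t b) {
      return static_cast<uint8_t>((a + b + 1) >> 1);
    }

    int main() {
      const uint8_t row0 = 40;   // pixel from the current row
      const uint8_t row1 = 200;  // pixel from the next row

      // Blend 50 / 50: a single average, as in the xloop50 paths.
      const uint8_t blend50 = AvgRound(row0, row1);

      // Blend 25 / 75: averaging the 50/50 result with row1 again weights
      // row1 at three quarters -- the two back-to-back pavgb in xloop25.
      // Result is (row0 + 3 * row1 + 2) / 4 to within one LSB of rounding.
      const uint8_t blend25 = AvgRound(AvgRound(row0, row1), row1);

      printf("50/50 = %d (exact %d)\n", blend50, (row0 + row1 + 1) / 2);
      printf("25/75 = %d (exact %d)\n", blend25, (row0 + 3 * row1 + 2) / 4);
      return 0;
    }

Reordering the branches so they all fall through to a shared xloop99 epilogue also lets every path reuse one last-pixel extrude instead of each carrying its own pop/ret tail.
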
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/938014
git-svn-id: http://libyuv.googlecode.com/svn/trunk@486 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/README.chromium b/README.chromium
index 97118c9..2793fa7 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 485
+Version: 486
License: BSD
License File: LICENSE
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index c03128b..2a1468d 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 485
+#define LIBYUV_VERSION 486
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/source/scale.cc b/source/scale.cc
index e6484af..b6eaa35 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1035,24 +1035,26 @@
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
shr eax, 1
cmp eax, 0 // dispatch to specialized filters if applicable.
je xloop100
+ sub edi, esi
cmp eax, 32
je xloop75
cmp eax, 64
je xloop50
cmp eax, 96
je xloop25
- movd xmm0, eax // high fraction 0..127
+
+ movd xmm0, eax // high fraction 1..127.
neg eax
add eax, 128
- movd xmm5, eax // low fraction 128..1
+ movd xmm5, eax // low fraction 127..1.
punpcklbw xmm5, xmm0
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0
+ // General purpose row blend.
align 16
xloop:
movdqa xmm0, [esi]
@@ -1069,71 +1071,7 @@
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop
-
- punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
- pshufhw xmm0, xmm0, 0xff
- punpckhqdq xmm0, xmm0
- movdqa [esi + edi], xmm0
-
- pop edi
- pop esi
- ret
-
- // Blend 100 / 0 - Copy row unchanged.
- align 16
- xloop100:
- movdqa xmm0, [esi]
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop100
-
- punpckhbw xmm0, xmm0
- pshufhw xmm0, xmm0, 0xff
- punpckhqdq xmm0, xmm0
- movdqa [esi + edi], xmm0
- pop edi
- pop esi
- ret
-
- // Blend 75 / 25.
- align 16
- xloop75:
- movdqa xmm1, [esi]
- movdqa xmm0, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop75
-
- punpckhbw xmm0, xmm0
- pshufhw xmm0, xmm0, 0xff
- punpckhqdq xmm0, xmm0
- movdqa [esi + edi], xmm0
- pop edi
- pop esi
- ret
-
- // Blend 50 / 50.
- align 16
- xloop50:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop50
-
- punpckhbw xmm0, xmm0
- pshufhw xmm0, xmm0, 0xff
- punpckhqdq xmm0, xmm0
- movdqa [esi + edi], xmm0
- pop edi
- pop esi
- ret
+ jmp xloop99
// Blend 25 / 75.
align 16
@@ -1146,7 +1084,44 @@
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop25
+ jmp xloop99
+ // Blend 50 / 50.
+ align 16
+ xloop50:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ align 16
+ xloop75:
+ movdqa xmm1, [esi]
+ movdqa xmm0, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ align 16
+ xloop100:
+ movdqa xmm0, [esi]
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop100
+
+ // Extrude last pixel.
+ xloop99:
punpckhbw xmm0, xmm0
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
@@ -1154,7 +1129,6 @@
pop edi
pop esi
ret
-
}
}
@@ -1171,29 +1145,31 @@
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
shr eax, 1
- cmp eax, 0
+ cmp eax, 0 // dispatch to specialized filters if applicable.
je xloop100
+ sub edi, esi
cmp eax, 32
je xloop75
cmp eax, 64
je xloop50
cmp eax, 96
je xloop25
- movd xmm0, eax // high fraction 0..127
+
+ movd xmm0, eax // high fraction 1..127.
neg eax
add eax, 128
- movd xmm5, eax // low fraction 128..1
+ movd xmm5, eax // low fraction 127..1.
punpcklbw xmm5, xmm0
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0
+ // General purpose row blend.
align 16
xloop:
movdqu xmm0, [esi]
movdqu xmm2, [esi + edx]
- movdqu xmm1, xmm0
+ movdqa xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
pmaddubsw xmm0, xmm5
@@ -1205,71 +1181,7 @@
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop
-
- punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
- pshufhw xmm0, xmm0, 0xff
- punpckhqdq xmm0, xmm0
- movdqu [esi + edi], xmm0
-
- pop edi
- pop esi
- ret
-
- // Blend 100 / 0 - Copy row unchanged.
- align 16
- xloop100:
- movdqu xmm0, [esi]
- sub ecx, 16
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop100
-
- punpckhbw xmm0, xmm0
- pshufhw xmm0, xmm0, 0xff
- punpckhqdq xmm0, xmm0
- movdqu [esi + edi], xmm0
- pop edi
- pop esi
- ret
-
- // Blend 75 / 25.
- align 16
- xloop75:
- movdqu xmm1, [esi]
- movdqu xmm0, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop75
-
- punpckhbw xmm0, xmm0
- pshufhw xmm0, xmm0, 0xff
- punpckhqdq xmm0, xmm0
- movdqu [esi + edi], xmm0
- pop edi
- pop esi
- ret
-
- // Blend 50 / 50.
- align 16
- xloop50:
- movdqu xmm0, [esi]
- movdqu xmm1, [esi + edx]
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqu [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop50
-
- punpckhbw xmm0, xmm0
- pshufhw xmm0, xmm0, 0xff
- punpckhqdq xmm0, xmm0
- movdqu [esi + edi], xmm0
- pop edi
- pop esi
- ret
+ jmp xloop99
// Blend 25 / 75.
align 16
@@ -1282,7 +1194,44 @@
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop25
+ jmp xloop99
+ // Blend 50 / 50.
+ align 16
+ xloop50:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ align 16
+ xloop75:
+ movdqu xmm1, [esi]
+ movdqu xmm0, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ align 16
+ xloop100:
+ movdqu xmm0, [esi]
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop100
+
+ // Extrude last pixel.
+ xloop99:
punpckhbw xmm0, xmm0
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
@@ -1290,7 +1239,6 @@
pop edi
pop esi
ret
-
}
}
@@ -2068,9 +2016,13 @@
"sub %1,%0 \n"
"shr %3 \n"
"cmp $0x0,%3 \n"
- "je 2f \n"
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
"cmp $0x40,%3 \n"
- "je 3f \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
"movd %3,%%xmm0 \n"
"neg %3 \n"
"add $0x80,%3 \n"
@@ -2078,6 +2030,8 @@
"punpcklbw %%xmm0,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
+
+ // General purpose row blend.
".p2align 4 \n"
"1: \n"
"movdqa (%1),%%xmm0 \n"
@@ -2094,25 +2048,57 @@
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 1b \n"
- "jmp 4f \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
".p2align 4 \n"
- "2: \n"
+ "25: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ ".p2align 4 \n"
+ "50: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ ".p2align 4 \n"
+ "75: \n"
+ "movdqa (%1),%%xmm1 \n"
+ "movdqa (%1,%4,1),%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ ".p2align 4 \n"
+ "100: \n"
"movdqa (%1),%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
- "jg 2b \n"
- "jmp 4f \n"
- ".p2align 4 \n"
- "3: \n"
- "movdqa (%1),%%xmm0 \n"
- "pavgb (%1,%4,1),%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 3b \n"
- ".p2align 4 \n"
- "4: \n"
+ "jg 100b \n"
+
+ // Extrude last pixel.
+ "99: \n"
"punpckhbw %%xmm0,%%xmm0 \n"
"pshufhw $0xff,%%xmm0,%%xmm0 \n"
"punpckhqdq %%xmm0,%%xmm0 \n"
@@ -2137,9 +2123,13 @@
"sub %1,%0 \n"
"shr %3 \n"
"cmp $0x0,%3 \n"
- "je 2f \n"
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
"cmp $0x40,%3 \n"
- "je 3f \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
"movd %3,%%xmm0 \n"
"neg %3 \n"
"add $0x80,%3 \n"
@@ -2147,11 +2137,13 @@
"punpcklbw %%xmm0,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
+
+ // General purpose row blend.
".p2align 4 \n"
"1: \n"
"movdqu (%1),%%xmm0 \n"
"movdqu (%1,%4,1),%%xmm2 \n"
- "movdqu %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
"pmaddubsw %%xmm5,%%xmm0 \n"
@@ -2163,25 +2155,57 @@
"movdqu %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 1b \n"
- "jmp 4f \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
".p2align 4 \n"
- "2: \n"
+ "25: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu (%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ ".p2align 4 \n"
+ "50: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu (%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ ".p2align 4 \n"
+ "75: \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%1,%4,1),%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ ".p2align 4 \n"
+ "100: \n"
"movdqu (%1),%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
- "jg 2b \n"
- "jmp 4f \n"
- ".p2align 4 \n"
- "3: \n"
- "movdqu (%1),%%xmm0 \n"
- "pavgb (%1,%4,1),%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 3b \n"
- ".p2align 4 \n"
- "4: \n"
+ "jg 100b \n"
+
+ // Extrude last pixel.
+ "99: \n"
"punpckhbw %%xmm0,%%xmm0 \n"
"pshufhw $0xff,%%xmm0,%%xmm0 \n"
"punpckhqdq %%xmm0,%%xmm0 \n"
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index 5d4e1ac..6484193 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -289,12 +289,17 @@
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
shr eax, 1
- cmp eax, 0
- je xloop1
+ cmp eax, 0 // dispatch to specialized filters if applicable.
+ je xloop100
+ sub edi, esi
+ cmp eax, 32
+ je xloop75
cmp eax, 64
- je xloop2
+ je xloop50
+ cmp eax, 96
+ je xloop25
+
movd xmm0, eax // high fraction 0..127
neg eax
add eax, 128
@@ -319,36 +324,57 @@
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
jg xloop
+ jmp xloop99
- shufps xmm0, xmm0, 0xff
- movdqa [esi + edi], xmm0 // duplicate last pixel for filtering
- pop edi
- pop esi
- ret
-
+ // Blend 25 / 75.
align 16
- xloop1:
+ xloop25:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ align 16
+ xloop50:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ align 16
+ xloop75:
+ movdqa xmm1, [esi]
+ movdqa xmm0, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ sub ecx, 4
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ align 16
+ xloop100:
movdqa xmm0, [esi]
sub ecx, 4
movdqa [esi + edi], xmm0
lea esi, [esi + 16]
- jg xloop1
+ jg xloop100
- shufps xmm0, xmm0, 0xff
- movdqa [esi + edi], xmm0
- pop edi
- pop esi
- ret
-
- align 16
- xloop2:
- movdqa xmm0, [esi]
- pavgb xmm0, [esi + edx]
- sub ecx, 4
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop2
-
+ // Extrude last pixel.
+ xloop99:
shufps xmm0, xmm0, 0xff
movdqa [esi + edi], xmm0
pop edi
@@ -585,12 +611,17 @@
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
asm volatile (
- "sub %1,%0 \n"
"shr %3 \n"
"cmp $0x0,%3 \n"
- "je 2f \n"
+ "je 100f \n"
+ "sub %1,%0 \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
"cmp $0x40,%3 \n"
- "je 3f \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
"movd %3,%%xmm0 \n"
"neg %3 \n"
"add $0x80,%3 \n"
@@ -598,6 +629,8 @@
"punpcklbw %%xmm0,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
+
+ // General purpose row blend.
".p2align 4 \n"
"1: \n"
"movdqa (%1),%%xmm0 \n"
@@ -614,30 +647,62 @@
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
"jg 1b \n"
- "jmp 4f \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
".p2align 4 \n"
- "2: \n"
+ "25: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ ".p2align 4 \n"
+ "50: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ ".p2align 4 \n"
+ "75: \n"
+ "movdqa (%1),%%xmm1 \n"
+ "movdqa (%1,%4,1),%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ ".p2align 4 \n"
+ "100: \n"
"movdqa (%1),%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
"lea 0x10(%1),%1 \n"
- "jg 2b \n"
- "jmp 4f \n"
- ".p2align 4 \n"
- "3: \n"
- "movdqa (%1),%%xmm0 \n"
- "pavgb (%1,%4,1),%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 3b \n"
- "4: \n"
- ".p2align 4 \n"
+ "jg 100b \n"
+
+ // Extrude last pixel.
+ "99: \n"
"shufps $0xff,%%xmm0,%%xmm0 \n"
"movdqa %%xmm0,(%1,%0,1) \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
"+r"(source_y_fraction) // %3
: "r"(static_cast<intptr_t>(src_stride)) // %4
: "memory", "cc"
@@ -645,6 +710,7 @@
, "xmm0", "xmm1", "xmm2", "xmm5"
#endif
);
+
}
#endif // defined(__x86_64__) || defined(__i386__)
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 4af3c15..f521c63 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -477,14 +477,19 @@
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
- "beq 2f \n"
+ "beq 100f \n"
"add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
"cmp %4, #128 \n"
- "beq 3f \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
+ // General purpose row blend.
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
@@ -497,23 +502,48 @@
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
- "b 4f \n"
+ "b 99f \n"
- "2: \n"
+ // Blend 25 / 75.
+ "25: \n"
"vld1.u8 {q0}, [%1]! \n"
+ "vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
- "bgt 2b \n"
- "b 4f \n"
+ "bgt 25b \n"
+ "b 99f \n"
- "3: \n"
+ // Blend 50 / 50.
+ "50: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
- "bgt 3b \n"
- "4: \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ "vld1.u8 {q1}, [%1]! \n"
+ "vld1.u8 {q0}, [%2]! \n"
+ "subs %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.u8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.u8 {q0}, [%1]! \n"
+ "subs %3, #16 \n"
+ "vst1.u8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
"vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1