ARGB scale 2x upsample with specialization for 25%/75% row blends
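
Adds specialized row-blend loops for the source_y_fraction values that
2x upsampling actually hits (0, 25%, 50%, 75% of the next row), so the
common cases use pavgb / vrhadd rounding averages instead of the general
multiply path, and folds the per-case "extrude last pixel" epilogues
into one shared tail. All stores go through a precomputed dst - src
offset, so a single pointer increment advances source and destination
together.

For reference, a scalar sketch of what the blend paths compute. The
helper and function names here are hypothetical, the dispatch constants
assume the unshifted 0..255 fraction, and the general case simplifies
the SIMD path's rounding:

  #include <stdint.h>

  // pavgb / vrhadd.u8 compute a rounding average: (a + b + 1) >> 1.
  static uint8_t RoundAvg(uint8_t a, uint8_t b) {
    return (uint8_t)((a + b + 1) >> 1);
  }

  // source_y_fraction is the weight of row1, in 0..255.
  void InterpolateRowReference(uint8_t* dst, const uint8_t* row0,
                               const uint8_t* row1, int width,
                               int source_y_fraction) {
    for (int x = 0; x < width; ++x) {
      uint8_t a = row0[x];
      uint8_t b = row1[x];
      switch (source_y_fraction) {
        case 0:    // Blend 100 / 0 - copy row unchanged.
          dst[x] = a;
          break;
        case 64:   // Blend 75 / 25: two averages give ~(3a + b) / 4.
          dst[x] = RoundAvg(RoundAvg(a, b), a);
          break;
        case 128:  // Blend 50 / 50.
          dst[x] = RoundAvg(a, b);
          break;
        case 192:  // Blend 25 / 75: ~(a + 3b) / 4.
          dst[x] = RoundAvg(RoundAvg(a, b), b);
          break;
        default: { // General blend, as in the pmaddubsw path:
                   // fraction halved to 1..127, weights sum to 128.
          int f = source_y_fraction >> 1;
          dst[x] = (uint8_t)((a * (128 - f) + b * f) >> 7);
          break;
        }
      }
    }
  }

The 25%/75% cases cost one extra pavgb over the 50% case but still
avoid the unpack/pmaddubsw/pack sequence of the general loop.
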
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/938014

git-svn-id: http://libyuv.googlecode.com/svn/trunk@486 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/README.chromium b/README.chromium
index 97118c9..2793fa7 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 485
+Version: 486
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index c03128b..2a1468d 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 485
+#define LIBYUV_VERSION 486
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/scale.cc b/source/scale.cc
index e6484af..b6eaa35 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1035,24 +1035,26 @@
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
     sub        edi, esi
     shr        eax, 1
     cmp        eax, 0  // dispatch to specialized filters if applicable.
     je         xloop100
     cmp        eax, 32
     je         xloop75
     cmp        eax, 64
     je         xloop50
     cmp        eax, 96
     je         xloop25
-    movd       xmm0, eax  // high fraction 0..127
+
+    movd       xmm0, eax  // high fraction 1..127.
     neg        eax
     add        eax, 128
-    movd       xmm5, eax  // low fraction 128..1
+    movd       xmm5, eax  // low fraction 127..1.
     punpcklbw  xmm5, xmm0
     punpcklwd  xmm5, xmm5
     pshufd     xmm5, xmm5, 0
 
+    // General purpose row blend.
     align      16
   xloop:
     movdqa     xmm0, [esi]
@@ -1069,71 +1071,7 @@
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop
-
-    punpckhbw  xmm0, xmm0           // duplicate last pixel for filtering
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-
-    pop        edi
-    pop        esi
-    ret
-
-    // Blend 100 / 0 - Copy row unchanged.
-    align      16
-  xloop100:
-    movdqa     xmm0, [esi]
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop100
-
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    // Blend 75 / 25.
-    align      16
-  xloop75:
-    movdqa     xmm1, [esi]
-    movdqa     xmm0, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop75
-
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    // Blend 50 / 50.
-    align      16
-  xloop50:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop50
-
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
+    jmp        xloop99
 
     // Blend 25 / 75.
     align      16
@@ -1146,7 +1084,44 @@
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop25
+    jmp        xloop99
 
+    // Blend 50 / 50.
+    align      16
+  xloop50:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+    align      16
+  xloop75:
+    movdqa     xmm1, [esi]
+    movdqa     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+    align      16
+  xloop100:
+    movdqa     xmm0, [esi]
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop100
+
+    // Extrude last pixel.
+  xloop99:
     punpckhbw  xmm0, xmm0
     pshufhw    xmm0, xmm0, 0xff
     punpckhqdq xmm0, xmm0
@@ -1154,7 +1129,6 @@
     pop        edi
     pop        esi
     ret
-
   }
 }
 
@@ -1171,29 +1145,31 @@
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
     sub        edi, esi
     shr        eax, 1
-    cmp        eax, 0
+    cmp        eax, 0  // dispatch to specialized filters if applicable.
     je         xloop100
     cmp        eax, 32
     je         xloop75
     cmp        eax, 64
     je         xloop50
     cmp        eax, 96
     je         xloop25
-    movd       xmm0, eax  // high fraction 0..127
+
+    movd       xmm0, eax  // high fraction 1..127.
     neg        eax
     add        eax, 128
-    movd       xmm5, eax  // low fraction 128..1
+    movd       xmm5, eax  // low fraction 127..1.
     punpcklbw  xmm5, xmm0
     punpcklwd  xmm5, xmm5
     pshufd     xmm5, xmm5, 0
 
+    // General purpose row blend.
     align      16
   xloop:
     movdqu     xmm0, [esi]
     movdqu     xmm2, [esi + edx]
-    movdqu     xmm1, xmm0
+    movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm2
     punpckhbw  xmm1, xmm2
     pmaddubsw  xmm0, xmm5
@@ -1205,71 +1181,7 @@
     movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop
-
-    punpckhbw  xmm0, xmm0           // duplicate last pixel for filtering
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqu     [esi + edi], xmm0
-
-    pop        edi
-    pop        esi
-    ret
-
-    // Blend 100 / 0 - Copy row unchanged.
-    align      16
-  xloop100:
-    movdqu     xmm0, [esi]
-    sub        ecx, 16
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop100
-
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqu     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    // Blend 75 / 25.
-    align      16
-  xloop75:
-    movdqu     xmm1, [esi]
-    movdqu     xmm0, [esi + edx]
-    pavgb      xmm0, xmm1
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop75
-
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqu     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    // Blend 50 / 50.
-    align      16
-  xloop50:
-    movdqu     xmm0, [esi]
-    movdqu     xmm1, [esi + edx]
-    pavgb      xmm0, xmm1
-    sub        ecx, 16
-    movdqu     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop50
-
-    punpckhbw  xmm0, xmm0
-    pshufhw    xmm0, xmm0, 0xff
-    punpckhqdq xmm0, xmm0
-    movdqu     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
+    jmp        xloop99
 
     // Blend 25 / 75.
     align      16
@@ -1282,7 +1194,44 @@
     movdqu     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop25
+    jmp        xloop99
 
+    // Blend 50 / 50.
+    align      16
+  xloop50:
+    movdqu     xmm0, [esi]
+    movdqu     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+    align      16
+  xloop75:
+    movdqu     xmm1, [esi]
+    movdqu     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+    align      16
+  xloop100:
+    movdqu     xmm0, [esi]
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop100
+
+    // Extrude last pixel.
+  xloop99:
     punpckhbw  xmm0, xmm0
     pshufhw    xmm0, xmm0, 0xff
     punpckhqdq xmm0, xmm0
@@ -1290,7 +1239,6 @@
     pop        edi
     pop        esi
     ret
-
   }
 }
 
@@ -2068,9 +2016,13 @@
     "sub       %1,%0                           \n"
     "shr       %3                              \n"
     "cmp       $0x0,%3                         \n"
-    "je        2f                              \n"
+    "je        100f                            \n"
+    "cmp       $0x20,%3                        \n"
+    "je        75f                             \n"
     "cmp       $0x40,%3                        \n"
-    "je        3f                              \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
     "movd      %3,%%xmm0                       \n"
     "neg       %3                              \n"
     "add       $0x80,%3                        \n"
@@ -2078,6 +2030,8 @@
     "punpcklbw %%xmm0,%%xmm5                   \n"
     "punpcklwd %%xmm5,%%xmm5                   \n"
     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+
+    // General purpose row blend.
     ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%1),%%xmm0                     \n"
@@ -2094,25 +2048,57 @@
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        1b                              \n"
-    "jmp       4f                              \n"
+    "jmp       99f                             \n"
+
+    // Blend 25 / 75.
     ".p2align  4                               \n"
-  "2:                                          \n"
+  "25:                                         \n"
+    "movdqa    (%1),%%xmm0                     \n"
+    "movdqa    (%1,%4,1),%%xmm1                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        25b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    ".p2align  4                               \n"
+  "50:                                         \n"
+    "movdqa    (%1),%%xmm0                     \n"
+    "movdqa    (%1,%4,1),%%xmm1                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 75 / 25.
+    ".p2align  4                               \n"
+  "75:                                         \n"
+    "movdqa    (%1),%%xmm1                     \n"
+    "movdqa    (%1,%4,1),%%xmm0                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    ".p2align  4                               \n"
+  "100:                                        \n"
     "movdqa    (%1),%%xmm0                     \n"
     "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
-    "jg        2b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "3:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "pavgb     (%1,%4,1),%%xmm0                \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        3b                              \n"
-    ".p2align  4                               \n"
-  "4:                                          \n"
+    "jg        100b                            \n"
+
+    // Extrude last pixel.
+  "99:                                         \n"
     "punpckhbw %%xmm0,%%xmm0                   \n"
     "pshufhw   $0xff,%%xmm0,%%xmm0             \n"
     "punpckhqdq %%xmm0,%%xmm0                  \n"
@@ -2137,9 +2123,13 @@
     "sub       %1,%0                           \n"
     "shr       %3                              \n"
     "cmp       $0x0,%3                         \n"
-    "je        2f                              \n"
+    "je        100f                            \n"
+    "cmp       $0x20,%3                        \n"
+    "je        75f                             \n"
     "cmp       $0x40,%3                        \n"
-    "je        3f                              \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
     "movd      %3,%%xmm0                       \n"
     "neg       %3                              \n"
     "add       $0x80,%3                        \n"
@@ -2147,11 +2137,13 @@
     "punpcklbw %%xmm0,%%xmm5                   \n"
     "punpcklwd %%xmm5,%%xmm5                   \n"
     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+
+    // General purpose row blend.
     ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%1),%%xmm0                     \n"
     "movdqu    (%1,%4,1),%%xmm2                \n"
-    "movdqu    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklbw %%xmm2,%%xmm0                   \n"
     "punpckhbw %%xmm2,%%xmm1                   \n"
     "pmaddubsw %%xmm5,%%xmm0                   \n"
@@ -2163,25 +2155,57 @@
     "movdqu    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        1b                              \n"
-    "jmp       4f                              \n"
+    "jmp       99f                             \n"
+
+    // Blend 25 / 75.
     ".p2align  4                               \n"
-  "2:                                          \n"
+  "25:                                         \n"
+    "movdqu    (%1),%%xmm0                     \n"
+    "movdqu    (%1,%4,1),%%xmm1                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqu    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        25b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    ".p2align  4                               \n"
+  "50:                                         \n"
+    "movdqu    (%1),%%xmm0                     \n"
+    "movdqu    (%1,%4,1),%%xmm1                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqu    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 75 / 25.
+    ".p2align  4                               \n"
+  "75:                                         \n"
+    "movdqu    (%1),%%xmm1                     \n"
+    "movdqu    (%1,%4,1),%%xmm0                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqu    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    ".p2align  4                               \n"
+  "100:                                        \n"
     "movdqu    (%1),%%xmm0                     \n"
     "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
-    "jg        2b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "3:                                          \n"
-    "movdqu    (%1),%%xmm0                     \n"
-    "pavgb     (%1,%4,1),%%xmm0                \n"
-    "sub       $0x10,%2                        \n"
-    "movdqu    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        3b                              \n"
-    ".p2align  4                               \n"
-  "4:                                          \n"
+    "jg        100b                            \n"
+
+    // Extrude last pixel.
+  "99:                                         \n"
     "punpckhbw %%xmm0,%%xmm0                   \n"
     "pshufhw   $0xff,%%xmm0,%%xmm0             \n"
     "punpckhqdq %%xmm0,%%xmm0                  \n"
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index 5d4e1ac..6484193 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -289,12 +289,17 @@
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
     sub        edi, esi
     shr        eax, 1
-    cmp        eax, 0
-    je         xloop1
+    cmp        eax, 0  // dispatch to specialized filters if applicable.
+    je         xloop100
+    cmp        eax, 32
+    je         xloop75
     cmp        eax, 64
-    je         xloop2
+    je         xloop50
+    cmp        eax, 96
+    je         xloop25
+
-    movd       xmm0, eax  // high fraction 0..127
+    movd       xmm0, eax  // high fraction 1..127.
     neg        eax
     add        eax, 128
@@ -319,36 +324,57 @@
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop
+    jmp        xloop99
 
-    shufps     xmm0, xmm0, 0xff
-    movdqa     [esi + edi], xmm0    // duplicate last pixel for filtering
-    pop        edi
-    pop        esi
-    ret
-
+    // Blend 25 / 75.
     align      16
-  xloop1:
+  xloop25:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop25
+    jmp        xloop99
+
+    // Blend 50 / 50.
+    align      16
+  xloop50:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+    align      16
+  xloop75:
+    movdqa     xmm1, [esi]
+    movdqa     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+    align      16
+  xloop100:
     movdqa     xmm0, [esi]
     sub        ecx, 4
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
-    jg         xloop1
+    jg         xloop100
 
-    shufps     xmm0, xmm0, 0xff
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    align      16
-  xloop2:
-    movdqa     xmm0, [esi]
-    pavgb      xmm0, [esi + edx]
-    sub        ecx, 4
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop2
-
+    // Extrude last pixel.
+  xloop99:
     shufps     xmm0, xmm0, 0xff
     movdqa     [esi + edi], xmm0
     pop        edi
@@ -585,12 +611,17 @@
                                ptrdiff_t src_stride, int dst_width,
                                int source_y_fraction) {
   asm volatile (
-    "sub       %1,%0                           \n"
     "shr       %3                              \n"
     "cmp       $0x0,%3                         \n"
-    "je        2f                              \n"
+    "je        100f                            \n"
+    "sub       %1,%0                           \n"
+    "cmp       $0x20,%3                        \n"
+    "je        75f                             \n"
     "cmp       $0x40,%3                        \n"
-    "je        3f                              \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
+
     "movd      %3,%%xmm0                       \n"
     "neg       %3                              \n"
     "add       $0x80,%3                        \n"
@@ -598,6 +629,8 @@
     "punpcklbw %%xmm0,%%xmm5                   \n"
     "punpcklwd %%xmm5,%%xmm5                   \n"
     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+
+    // General purpose row blend.
     ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%1),%%xmm0                     \n"
@@ -614,30 +647,62 @@
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        1b                              \n"
-    "jmp       4f                              \n"
+    "jmp       99f                             \n"
+
+    // Blend 25 / 75.
     ".p2align  4                               \n"
-  "2:                                          \n"
+  "25:                                         \n"
+    "movdqa    (%1),%%xmm0                     \n"
+    "movdqa    (%1,%4,1),%%xmm1                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        25b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    ".p2align  4                               \n"
+  "50:                                         \n"
+    "movdqa    (%1),%%xmm0                     \n"
+    "movdqa    (%1,%4,1),%%xmm1                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 75 / 25.
+    ".p2align  4                               \n"
+  "75:                                         \n"
+    "movdqa    (%1),%%xmm1                     \n"
+    "movdqa    (%1,%4,1),%%xmm0                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    ".p2align  4                               \n"
+  "100:                                        \n"
     "movdqa    (%1),%%xmm0                     \n"
     "sub       $0x4,%2                         \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
-    "jg        2b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "3:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "pavgb     (%1,%4,1),%%xmm0                \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        3b                              \n"
-  "4:                                          \n"
-    ".p2align  4                               \n"
+    "jg        100b                            \n"
+
+    // Extrude last pixel.
+  "99:                                         \n"
     "shufps    $0xff,%%xmm0,%%xmm0             \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
-  : "+r"(dst_ptr),     // %0
-    "+r"(src_ptr),     // %1
-    "+r"(dst_width),   // %2
+  : "+r"(dst_ptr),    // %0
+    "+r"(src_ptr),    // %1
+    "+r"(dst_width),  // %2
     "+r"(source_y_fraction)  // %3
   : "r"(static_cast<intptr_t>(src_stride))  // %4
   : "memory", "cc"
@@ -645,6 +710,6 @@
     , "xmm0", "xmm1", "xmm2", "xmm5"
 #endif
   );
 }
 #endif  // defined(__x86_64__) || defined(__i386__)
 
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 4af3c15..f521c63 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -477,14 +477,19 @@
                           int dst_width, int source_y_fraction) {
   asm volatile (
     "cmp          %4, #0                       \n"
-    "beq          2f                           \n"
+    "beq          100f                         \n"
     "add          %2, %1                       \n"
+    "cmp          %4, #64                      \n"
+    "beq          75f                          \n"
     "cmp          %4, #128                     \n"
-    "beq          3f                           \n"
+    "beq          50f                          \n"
+    "cmp          %4, #192                     \n"
+    "beq          25f                          \n"
 
     "vdup.8       d5, %4                       \n"
     "rsb          %4, #256                     \n"
     "vdup.8       d4, %4                       \n"
+    // General purpose row blend.
   "1:                                          \n"
     "vld1.u8      {q0}, [%1]!                  \n"
     "vld1.u8      {q1}, [%2]!                  \n"
@@ -497,23 +502,48 @@
     "vrshrn.u16   d1, q14, #8                  \n"
     "vst1.u8      {q0}, [%0]!                  \n"
     "bgt          1b                           \n"
-    "b            4f                           \n"
+    "b            99f                          \n"
 
-  "2:                                          \n"
+    // Blend 25 / 75.
+  "25:                                         \n"
     "vld1.u8      {q0}, [%1]!                  \n"
+    "vld1.u8      {q1}, [%2]!                  \n"
     "subs         %3, #16                      \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vrhadd.u8    q0, q1                       \n"
     "vst1.u8      {q0}, [%0]!                  \n"
-    "bgt          2b                           \n"
-    "b            4f                           \n"
+    "bgt          25b                          \n"
+    "b            99f                          \n"
 
-  "3:                                          \n"
+    // Blend 50 / 50.
+  "50:                                         \n"
     "vld1.u8      {q0}, [%1]!                  \n"
     "vld1.u8      {q1}, [%2]!                  \n"
     "subs         %3, #16                      \n"
     "vrhadd.u8    q0, q1                       \n"
     "vst1.u8      {q0}, [%0]!                  \n"
-    "bgt          3b                           \n"
-  "4:                                          \n"
+    "bgt          50b                          \n"
+    "b            99f                          \n"
+
+    // Blend 75 / 25.
+  "75:                                         \n"
+    "vld1.u8      {q1}, [%1]!                  \n"
+    "vld1.u8      {q0}, [%2]!                  \n"
+    "subs         %3, #16                      \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vst1.u8      {q0}, [%0]!                  \n"
+    "bgt          75b                          \n"
+    "b            99f                          \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    "vld1.u8      {q0}, [%1]!                  \n"
+    "subs         %3, #16                      \n"
+    "vst1.u8      {q0}, [%0]!                  \n"
+    "bgt          100b                         \n"
+
+  "99:                                         \n"
     "vst1.u8      {d1[7]}, [%0]                \n"
     : "+r"(dst_ptr),          // %0
       "+r"(src_ptr),          // %1