source/scale_argb.cc - platform/external/libyuv - Git at Google

 /*
  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */

 #include "libyuv/scale.h"

 #include <assert.h>
 #include <string.h>

 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"  // For CopyARGB
 #include "libyuv/row.h"

 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif

 static __inline int Abs(int v) {
   return v >= 0 ? v : -v;
 }

 // ARGB scaling uses bilinear or point, but not box filter.
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_SCALEARGBROWDOWNEVEN_NEON
 #define HAS_SCALEARGBROWDOWN2_NEON
 void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
                                int src_stepx,
                                uint8* dst_argb, int dst_width);
 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
                                   int src_stepx,
                                   uint8* dst_argb, int dst_width);
 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                             uint8* dst, int dst_width);
 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst, int dst_width);
 #endif

 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
 #define HAS_SCALEARGBROWDOWN2_SSE2
 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
 __declspec(naked) __declspec(align(16))
 static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                                    ptrdiff_t /* src_stride */,
                                    uint8* dst_argb, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_argb
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_argb
     mov        ecx, [esp + 16]       // dst_width

     align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     shufps     xmm0, xmm1, 0xdd
     sub        ecx, 4
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         wloop

     ret
   }
 }

 // Blends 8x2 rectangle to 4x1.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
 __declspec(naked) __declspec(align(16))
 static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                       ptrdiff_t src_stride,
                                       uint8* dst_argb, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_argb
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_argb
     mov        ecx, [esp + 4 + 16]   // dst_width

     align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
     movdqa     xmm2, [eax + esi]
     movdqa     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm2            // average rows
     pavgb      xmm1, xmm3
     movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
     shufps     xmm0, xmm1, 0x88      // even pixels
     shufps     xmm2, xmm1, 0xdd      // odd pixels
     pavgb      xmm0, xmm2
     sub        ecx, 4
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         wloop

     pop        esi
     ret
   }
 }

 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
 // Reads 4 pixels at a time.
 // Alignment requirement: dst_argb 16 byte aligned.
 __declspec(naked) __declspec(align(16))
 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                                int src_stepx,
                                uint8* dst_argb, int dst_width) {
   __asm {
     push       ebx
     push       edi
     mov        eax, [esp + 8 + 4]    // src_argb
                                      // src_stride ignored
     mov        ebx, [esp + 8 + 12]   // src_stepx
     mov        edx, [esp + 8 + 16]   // dst_argb
     mov        ecx, [esp + 8 + 20]   // dst_width
     lea        ebx, [ebx * 4]
     lea        edi, [ebx + ebx * 2]

     align      16
   wloop:
     movd       xmm0, [eax]
     movd       xmm1, [eax + ebx]
     punpckldq  xmm0, xmm1
     movd       xmm2, [eax + ebx * 2]
     movd       xmm3, [eax + edi]
     lea        eax,  [eax + ebx * 4]
     punpckldq  xmm2, xmm3
     punpcklqdq xmm0, xmm2
     sub        ecx, 4
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         wloop

     pop        edi
     pop        ebx
     ret
   }
 }

 // Blends four 2x2 to 4x1.
 // Alignment requirement: dst_argb 16 byte aligned.
 __declspec(naked) __declspec(align(16))
 static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                          ptrdiff_t src_stride,
                                          int src_stepx,
                                          uint8* dst_argb, int dst_width) {
   __asm {
     push       ebx
     push       esi
     push       edi
     mov        eax, [esp + 12 + 4]    // src_argb
     mov        esi, [esp + 12 + 8]    // src_stride
     mov        ebx, [esp + 12 + 12]   // src_stepx
     mov        edx, [esp + 12 + 16]   // dst_argb
     mov        ecx, [esp + 12 + 20]   // dst_width
     lea        esi, [eax + esi]       // row1 pointer
     lea        ebx, [ebx * 4]
     lea        edi, [ebx + ebx * 2]

     align      16
   wloop:
     movq       xmm0, qword ptr [eax]  // row0 4 pairs
     movhps     xmm0, qword ptr [eax + ebx]
     movq       xmm1, qword ptr [eax + ebx * 2]
     movhps     xmm1, qword ptr [eax + edi]
     lea        eax,  [eax + ebx * 4]
     movq       xmm2, qword ptr [esi]  // row1 4 pairs
     movhps     xmm2, qword ptr [esi + ebx]
     movq       xmm3, qword ptr [esi + ebx * 2]
     movhps     xmm3, qword ptr [esi + edi]
     lea        esi,  [esi + ebx * 4]
     pavgb      xmm0, xmm2            // average rows
     pavgb      xmm1, xmm3
     movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
     shufps     xmm0, xmm1, 0x88      // even pixels
     shufps     xmm2, xmm1, 0xdd      // odd pixels
     pavgb      xmm0, xmm2
     sub        ecx, 4
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
     jg         wloop

     pop        edi
     pop        esi
     pop        ebx
     ret
   }
 }

 // Column scaling unfiltered. SSSE3 version.
 // TODO(fbarchard): Port to Neon

 #define HAS_SCALEARGBCOLS_SSE2
 __declspec(naked) __declspec(align(16))
 static void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                                int dst_width, int x, int dx) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]    // dst_argb
     mov        esi, [esp + 8 + 8]    // src_argb
     mov        ecx, [esp + 8 + 12]   // dst_width
     movd       xmm2, [esp + 8 + 16]  // x
     movd       xmm3, [esp + 8 + 20]  // dx
     pextrw     eax, xmm2, 1          // get x0 integer. preroll
     sub        ecx, 2
     jl         xloop29

     movdqa     xmm0, xmm2           // x1 = x0 + dx
     paddd      xmm0, xmm3
     punpckldq  xmm2, xmm0           // x0 x1
     punpckldq  xmm3, xmm3           // dx dx
     paddd      xmm3, xmm3           // dx * 2, dx * 2
     pextrw     edx, xmm2, 3         // get x1 integer. preroll

     // 2 Pixel loop.
     align      16
   xloop2:
     paddd      xmm2, xmm3           // x += dx
     movd       xmm0, qword ptr [esi + eax * 4]  // 1 source x0 pixels
     movd       xmm1, qword ptr [esi + edx * 4]  // 1 source x1 pixels
     punpckldq  xmm0, xmm1           // x0 x1
     pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
     pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
     movq       qword ptr [edi], xmm0
     lea        edi, [edi + 8]
     sub        ecx, 2               // 2 pixels
     jge        xloop2
  xloop29:

     add        ecx, 2 - 1
     jl         xloop99

     // 1 pixel remainder
     movd       xmm0, qword ptr [esi + eax * 4]  // 1 source x0 pixels
     movd       [edi], xmm0
  xloop99:

     pop        edi
     pop        esi
     ret
   }
 }

 // Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
 // TODO(fbarchard): Port to Neon

 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
 static const uvec8 kShuffleColARGB = {
   0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
   8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
 };

 // Shuffle table for duplicating 2 fractions into 8 bytes each
 static const uvec8 kShuffleFractions = {
   0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };

 #define HAS_SCALEARGBFILTERCOLS_SSSE3
 __declspec(naked) __declspec(align(16))
 static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                                       int dst_width, int x, int dx) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]    // dst_argb
     mov        esi, [esp + 8 + 8]    // src_argb
     mov        ecx, [esp + 8 + 12]   // dst_width
     movd       xmm2, [esp + 8 + 16]  // x
     movd       xmm3, [esp + 8 + 20]  // dx
     movdqa     xmm4, kShuffleColARGB
     movdqa     xmm5, kShuffleFractions
     pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
     psrlw      xmm6, 9
     pextrw     eax, xmm2, 1         // get x0 integer. preroll
     sub        ecx, 2
     jl         xloop29

     movdqa     xmm0, xmm2           // x1 = x0 + dx
     paddd      xmm0, xmm3
     punpckldq  xmm2, xmm0           // x0 x1
     punpckldq  xmm3, xmm3           // dx dx
     paddd      xmm3, xmm3           // dx * 2, dx * 2
     pextrw     edx, xmm2, 3         // get x1 integer. preroll

     // 2 Pixel loop.
     align      16
   xloop2:
     movdqa     xmm1, xmm2           // x0, x1 fractions.
     paddd      xmm2, xmm3           // x += dx
     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
     psrlw      xmm1, 9              // 7 bit fractions.
     movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
     pshufb     xmm1, xmm5           // 0000000011111111
     pshufb     xmm0, xmm4           // arrange pixels into pairs
     pxor       xmm1, xmm6           // 0..7f and 7f..0
     pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
     psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
     pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
     pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
     packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
     movq       qword ptr [edi], xmm0
     lea        edi, [edi + 8]
     sub        ecx, 2               // 2 pixels
     jge        xloop2
  xloop29:

     add        ecx, 2 - 1
     jl         xloop99

     // 1 pixel remainder
     psrlw      xmm2, 9              // 7 bit fractions.
     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
     pshufb     xmm2, xmm5           // 00000000
     pshufb     xmm0, xmm4           // arrange pixels into pairs
     pxor       xmm2, xmm6           // 0..7f and 7f..0
     pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
     psrlw      xmm0, 7
     packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
     movd       [edi], xmm0
  xloop99:

     pop        edi
     pop        esi
     ret
   }
 }

 #elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
 // GCC versions of row functions are verbatim conversions from Visual C.
 // Generated using gcc disassembly on Visual C object file:
 // objdump -D yuvscaler.obj >yuvscaler.txt
 #define HAS_SCALEARGBROWDOWN2_SSE2
 static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                                    ptrdiff_t /* src_stride */,
                                    uint8* dst_argb, int dst_width) {
   asm volatile (
     ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
     "lea       0x20(%0),%0                     \n"
     "shufps    $0xdd,%%xmm1,%%xmm0             \n"
     "sub       $0x4,%2                         \n"
     "movdqa    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
     "jg        1b                              \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
     "+r"(dst_width)  // %2
   :
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1"
 #endif
   );
 }

 static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                       ptrdiff_t src_stride,
                                       uint8* dst_argb, int dst_width) {
   asm volatile (
     ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
     "movdqa    (%0,%3,1),%%xmm2                \n"
     "movdqa    0x10(%0,%3,1),%%xmm3            \n"
     "lea       0x20(%0),%0                     \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
     "pavgb     %%xmm3,%%xmm1                   \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
     "sub       $0x4,%2                         \n"
     "movdqa    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
     "jg        1b                              \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_argb),   // %1
     "+r"(dst_width)   // %2
   : "r"(static_cast<intptr_t>(src_stride))   // %3
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3"
 #endif
   );
 }

 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
 // Reads 4 pixels at a time.
 // Alignment requirement: dst_argb 16 byte aligned.
 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                                int src_stepx,
                                uint8* dst_argb, int dst_width) {
   intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
   intptr_t src_stepx_x12 = 0;
   asm volatile (
     "lea       0x0(,%1,4),%1                   \n"
     "lea       (%1,%1,2),%4                    \n"
     ".p2align  4                               \n"
   "1:                                          \n"
     "movd      (%0),%%xmm0                     \n"
     "movd      (%0,%1,1),%%xmm1                \n"
     "punpckldq %%xmm1,%%xmm0                   \n"
     "movd      (%0,%1,2),%%xmm2                \n"
     "movd      (%0,%4,1),%%xmm3                \n"
     "lea       (%0,%1,4),%0                    \n"
     "punpckldq %%xmm3,%%xmm2                   \n"
     "punpcklqdq %%xmm2,%%xmm0                  \n"
     "sub       $0x4,%3                         \n"
     "movdqa    %%xmm0,(%2)                     \n"
     "lea       0x10(%2),%2                     \n"
     "jg        1b                              \n"
   : "+r"(src_argb),      // %0
     "+r"(src_stepx_x4),  // %1
     "+r"(dst_argb),      // %2
     "+r"(dst_width),     // %3
     "+r"(src_stepx_x12)  // %4
   :
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3"
 #endif
   );
 }

 // Blends four 2x2 to 4x1.
 // Alignment requirement: dst_argb 16 byte aligned.
 static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                          ptrdiff_t src_stride, int src_stepx,
                                          uint8* dst_argb, int dst_width) {
   intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
   intptr_t src_stepx_x12 = 0;
   intptr_t row1 = static_cast<intptr_t>(src_stride);
   asm volatile (
     "lea       0x0(,%1,4),%1                   \n"
     "lea       (%1,%1,2),%4                    \n"
     "lea       (%0,%5,1),%5                    \n"
     ".p2align  4                               \n"
   "1:                                          \n"
     "movq      (%0),%%xmm0                     \n"
     "movhps    (%0,%1,1),%%xmm0                \n"
     "movq      (%0,%1,2),%%xmm1                \n"
     "movhps    (%0,%4,1),%%xmm1                \n"
     "lea       (%0,%1,4),%0                    \n"
     "movq      (%5),%%xmm2                     \n"
     "movhps    (%5,%1,1),%%xmm2                \n"
     "movq      (%5,%1,2),%%xmm3                \n"
     "movhps    (%5,%4,1),%%xmm3                \n"
     "lea       (%5,%1,4),%5                    \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
     "pavgb     %%xmm3,%%xmm1                   \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
     "pavgb     %%xmm2,%%xmm0                   \n"
     "sub       $0x4,%3                         \n"
     "movdqa    %%xmm0,(%2)                     \n"
     "lea       0x10(%2),%2                     \n"
     "jg        1b                              \n"
   : "+r"(src_argb),       // %0
     "+r"(src_stepx_x4),   // %1
     "+r"(dst_argb),       // %2
     "+rm"(dst_width),     // %3
     "+r"(src_stepx_x12),  // %4
     "+r"(row1)            // %5
   :
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3"
 #endif
   );
 }

 #define HAS_SCALEARGBCOLS_SSE2
 static void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                                int dst_width, int x, int dx) {
   intptr_t x0 = 0, x1 = 0;
   asm volatile (
     "movd      %5,%%xmm2                       \n"
     "movd      %6,%%xmm3                       \n"
     "pextrw    $0x1,%%xmm2,%k3                 \n"
     "sub       $0x2,%2                         \n"
     "jl        29f                             \n"
     "movdqa    %%xmm2,%%xmm0                   \n"
     "paddd     %%xmm3,%%xmm0                   \n"
     "punpckldq %%xmm0,%%xmm2                   \n"
     "punpckldq %%xmm3,%%xmm3                   \n"
     "paddd     %%xmm3,%%xmm3                   \n"
     "pextrw    $0x3,%%xmm2,%k4                 \n"

     ".p2align  4                               \n"
   "2:                                          \n"
     "paddd     %%xmm3,%%xmm2                   \n"
     "movd      (%1,%3,4),%%xmm0                \n"
     "movd      (%1,%4,4),%%xmm1                \n"
     "punpckldq %%xmm1,%%xmm0                   \n"
     "pextrw    $0x1,%%xmm2,%k3                 \n"
     "pextrw    $0x3,%%xmm2,%k4                 \n"
     "movq      %%xmm0,(%0)                     \n"
     "lea       0x8(%0),%0                      \n"
     "sub       $0x2,%2                         \n"
     "jge       2b                              \n"

   "29:                                         \n"
     "add       $0x1,%2                         \n"
     "jl        99f                             \n"
     "movd      (%1,%3,4),%%xmm0                \n"
     "movd      %%xmm0,(%0)                     \n"
   "99:                                         \n"
   : "+r"(dst_argb),    // %0
     "+r"(src_argb),    // %1
     "+rm"(dst_width),  // %2
     "+r"(x0),          // %3
     "+r"(x1)           // %4
   : "rm"(x),           // %5
     "rm"(dx)           // %6
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3"
 #endif
   );
 }

 #ifdef __APPLE__
 #define CONST
 #else
 #define CONST static const
 #endif

 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
 CONST uvec8 kShuffleColARGB = {
   0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
   8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
 };

 // Shuffle table for duplicating 2 fractions into 8 bytes each
 CONST uvec8 kShuffleFractions = {
   0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };

 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
 #define HAS_SCALEARGBFILTERCOLS_SSSE3
 static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                                       int dst_width, int x, int dx) {
   intptr_t x0 = 0, x1 = 0;
   asm volatile (
     "movdqa    %0,%%xmm4                       \n"
     "movdqa    %1,%%xmm5                       \n"
   :
   : "m"(kShuffleColARGB),  // %0
     "m"(kShuffleFractions)  // %1
   );

   asm volatile (
     "movd      %5,%%xmm2                       \n"
     "movd      %6,%%xmm3                       \n"
     "pcmpeqb   %%xmm6,%%xmm6                   \n"
     "psrlw     $0x9,%%xmm6                     \n"
     "pextrw    $0x1,%%xmm2,%k3                 \n"
     "sub       $0x2,%2                         \n"
     "jl        29f                             \n"
     "movdqa    %%xmm2,%%xmm0                   \n"
     "paddd     %%xmm3,%%xmm0                   \n"
     "punpckldq %%xmm0,%%xmm2                   \n"
     "punpckldq %%xmm3,%%xmm3                   \n"
     "paddd     %%xmm3,%%xmm3                   \n"
     "pextrw    $0x3,%%xmm2,%k4                 \n"

     ".p2align  4                               \n"
   "2:                                          \n"
     "movdqa    %%xmm2,%%xmm1                   \n"
     "paddd     %%xmm3,%%xmm2                   \n"
     "movq      (%1,%3,4),%%xmm0                \n"
     "psrlw     $0x9,%%xmm1                     \n"
     "movhps    (%1,%4,4),%%xmm0                \n"
     "pshufb    %%xmm5,%%xmm1                   \n"
     "pshufb    %%xmm4,%%xmm0                   \n"
     "pxor      %%xmm6,%%xmm1                   \n"
     "pmaddubsw %%xmm1,%%xmm0                   \n"
     "psrlw     $0x7,%%xmm0                     \n"
     "pextrw    $0x1,%%xmm2,%k3                 \n"
     "pextrw    $0x3,%%xmm2,%k4                 \n"
     "packuswb  %%xmm0,%%xmm0                   \n"
     "movq      %%xmm0,(%0)                     \n"
     "lea       0x8(%0),%0                      \n"
     "sub       $0x2,%2                         \n"
     "jge       2b                              \n"

   "29:                                         \n"
     "add       $0x1,%2                         \n"
     "jl        99f                             \n"
     "psrlw     $0x9,%%xmm2                     \n"
     "movq      (%1,%3,4),%%xmm0                \n"
     "pshufb    %%xmm5,%%xmm2                   \n"
     "pshufb    %%xmm4,%%xmm0                   \n"
     "pxor      %%xmm6,%%xmm2                   \n"
     "pmaddubsw %%xmm2,%%xmm0                   \n"
     "psrlw     $0x7,%%xmm0                     \n"
     "packuswb  %%xmm0,%%xmm0                   \n"
     "movd      %%xmm0,(%0)                     \n"
   "99:                                         \n"
   : "+r"(dst_argb),    // %0
     "+r"(src_argb),    // %1
     "+rm"(dst_width),  // %2
     "+r"(x0),          // %3
     "+r"(x1)           // %4
   : "rm"(x),           // %5
     "rm"(dx)           // %6
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
 #endif
   );
 }
 #endif  // defined(__x86_64__) || defined(__i386__)

 static void ScaleARGBRowDown2_C(const uint8* src_argb,
                                 ptrdiff_t /* src_stride */,
                                 uint8* dst_argb, int dst_width) {
   const uint32* src = reinterpret_cast<const uint32*>(src_argb);
   uint32* dst = reinterpret_cast<uint32*>(dst_argb);

   for (int x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src[1];
     dst[1] = src[3];
     src += 4;
     dst += 2;
   }
   if (dst_width & 1) {
     dst[0] = src[1];
   }
 }

 static void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
                                    uint8* dst_argb, int dst_width) {
   for (int x = 0; x < dst_width; ++x) {
     dst_argb[0] = (src_argb[0] + src_argb[4] +
                   src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
     dst_argb[1] = (src_argb[1] + src_argb[5] +
                   src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
     dst_argb[2] = (src_argb[2] + src_argb[6] +
                   src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
     dst_argb[3] = (src_argb[3] + src_argb[7] +
                   src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
     src_argb += 8;
     dst_argb += 4;
   }
 }

 void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */,
                             int src_stepx,
                             uint8* dst_argb, int dst_width) {
   const uint32* src = reinterpret_cast<const uint32*>(src_argb);
   uint32* dst = reinterpret_cast<uint32*>(dst_argb);

   for (int x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src[0];
     dst[1] = src[src_stepx];
     src += src_stepx * 2;
     dst += 2;
   }
   if (dst_width & 1) {
     dst[0] = src[0];
   }
 }

 static void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
                                       ptrdiff_t src_stride,
                                       int src_stepx,
                                       uint8* dst_argb, int dst_width) {
   for (int x = 0; x < dst_width; ++x) {
     dst_argb[0] = (src_argb[0] + src_argb[4] +
                   src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
     dst_argb[1] = (src_argb[1] + src_argb[5] +
                   src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
     dst_argb[2] = (src_argb[2] + src_argb[6] +
                   src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
     dst_argb[3] = (src_argb[3] + src_argb[7] +
                   src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
     src_argb += src_stepx * 4;
     dst_argb += 4;
   }
 }

 // Mimics SSSE3 blender
 #define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
 #define BLENDERC(a, b, f, s) static_cast<uint32>( \
     BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
 #define BLENDER(a, b, f) \
     BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
     BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)

 static void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
                                   int dst_width, int x, int dx) {
   const uint32* src = reinterpret_cast<const uint32*>(src_argb);
   uint32* dst = reinterpret_cast<uint32*>(dst_argb);
   for (int j = 0; j < dst_width - 1; j += 2) {
     int xi = x >> 16;
     int xf = (x >> 9) & 0x7f;
     uint32 a = src[xi];
     uint32 b = src[xi + 1];
     dst[0] = BLENDER(a, b, xf);
     x += dx;
     xi = x >> 16;
     xf = (x >> 9) & 0x7f;
     a = src[xi];
     b = src[xi + 1];
     dst[1] = BLENDER(a, b, xf);
     x += dx;
     dst += 2;
   }
   if (dst_width & 1) {
     int xi = x >> 16;
     int xf = (x >> 9) & 0x7f;
     uint32 a = src[xi];
     uint32 b = src[xi + 1];
     dst[0] = BLENDER(a, b, xf);
   }
 }

 // ScaleARGB ARGB, 1/2
 // This is an optimized version for scaling down a ARGB to 1/2 of
 // its original size.

 static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
                            int dst_width, int dst_height,
                            int src_stride, int dst_stride,
                            const uint8* src_argb, uint8* dst_argb,
                            int x, int dx, int y, int dy,
                            FilterMode filtering) {
   assert(dx == 65536 * 2);  // Test scale factor of 2.
   assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
   // Advance to odd row / even column.
   if (filtering) {
     src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
   } else {
     src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
   }
   int row_stride = src_stride * (dy >> 16);
   void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
                             uint8* dst_argb, int dst_width) =
       filtering ? ScaleARGBRowDown2Box_C : ScaleARGBRowDown2_C;
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
     ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_SSE2 :
         ScaleARGBRowDown2_SSE2;
   }
 #elif defined(HAS_SCALEARGBROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
       IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
     ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
         ScaleARGBRowDown2_NEON;
   }
 #endif

   // TODO(fbarchard): Loop through source height to allow odd height.
   for (int y = 0; y < dst_height; ++y) {
     ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
     src_argb += row_stride;
     dst_argb += dst_stride;
   }
 }

 // ScaleARGB ARGB Even
 // This is an optimized version for scaling down a ARGB to even
 // multiple of its original size.
 static void ScaleARGBDownEven(int src_width, int src_height,
                               int dst_width, int dst_height,
                               int src_stride, int dst_stride,
                               const uint8* src_argb, uint8* dst_argb,
                               int x, int dx, int y, int dy,
                               FilterMode filtering) {
   assert(IS_ALIGNED(src_width, 2));
   assert(IS_ALIGNED(src_height, 2));
   int col_step = dx >> 16;
   int row_stride = (dy >> 16) * src_stride;
   src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
   void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
                                int src_step, uint8* dst_argb, int dst_width) =
       filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
     ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
         ScaleARGBRowDownEven_SSE2;
   }
 #elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
       IS_ALIGNED(src_argb, 4)) {
     ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
         ScaleARGBRowDownEven_NEON;
   }
 #endif

   for (int y = 0; y < dst_height; ++y) {
     ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
     src_argb += row_stride;
     dst_argb += dst_stride;
   }
 }

 // ScaleARGB ARGB to/from any dimensions, with bilinear interpolation.
 static void ScaleARGBBilinearDown(int src_height,
                                   int dst_width, int dst_height,
                                   int src_stride, int dst_stride,
                                   const uint8* src_argb, uint8* dst_argb,
                                   int x, int dx, int y, int dy) {
   assert(src_height > 0);
   assert(dst_width > 0);
   assert(dst_height > 0);
   int xlast = x + (dst_width - 1) * dx;
   int xl = (dx >= 0) ? x : xlast;
   int xr = (dx >= 0) ? xlast : x;
   xl = (xl >> 16) & ~3;  // Left edge aligned.
   xr = (xr >> 16) + 1;  // Right most pixel used.
   int clip_src_width = (((xr - xl) + 1 + 3) & ~3) * 4;  // Width aligned to 4.
   src_argb += xl * 4;
   x -= (xl << 16);
   assert(clip_src_width <= kMaxStride);
   // TODO(fbarchard): Remove clip_src_width alignment checks.
   SIMD_ALIGNED(uint8 row[kMaxStride + 16]);
   void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
       ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
       InterpolateRow_C;
 #if defined(HAS_INTERPOLATEROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(clip_src_width, 16)) {
       InterpolateRow = InterpolateRow_Unaligned_SSE2;
       if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
         InterpolateRow = InterpolateRow_SSE2;
       }
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(clip_src_width, 16)) {
       InterpolateRow = InterpolateRow_Unaligned_SSSE3;
       if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
         InterpolateRow = InterpolateRow_SSSE3;
       }
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
     InterpolateRow = InterpolateRow_Any_NEON;
     if (IS_ALIGNED(clip_src_width, 16)) {
       InterpolateRow = InterpolateRow_NEON;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
       IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
     InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
     if (IS_ALIGNED(clip_src_width, 4)) {
       InterpolateRow = InterpolateRow_MIPS_DSPR2;
     }
   }
 #endif
   void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
       int dst_width, int x, int dx) = ScaleARGBFilterCols_C;
 #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
   }
 #endif
   int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
   for (int j = 0; j < dst_height; ++j) {
     if (y > maxy) {
       y = maxy;
     }
     int yi = y >> 16;
     int yf = (y >> 8) & 255;
     const uint8* src = src_argb + yi * src_stride;
     InterpolateRow(row, src, src_stride, clip_src_width, yf);
     ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
     dst_argb += dst_stride;
     y += dy;
   }
 }

 // ScaleARGB ARGB to/from any dimensions, with bilinear interpolation.
 static void ScaleARGBBilinearUp(int src_width, int src_height,
                                 int dst_width, int dst_height,
                                 int src_stride, int dst_stride,
                                 const uint8* src_argb, uint8* dst_argb,
                                 int x, int dx, int y, int dy) {
   assert(src_width > 0);
   assert(src_height > 0);
   assert(dst_width > 0);
   assert(dst_height > 0);
   assert(dst_width * 4 <= kMaxStride);
   void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
       ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
       InterpolateRow_C;
 #if defined(HAS_INTERPOLATEROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
     InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(dst_width, 4)) {
       InterpolateRow = InterpolateRow_Unaligned_SSE2;
       if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
         InterpolateRow = InterpolateRow_SSE2;
       }
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 4)) {
       InterpolateRow = InterpolateRow_Unaligned_SSSE3;
       if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
         InterpolateRow = InterpolateRow_SSSE3;
       }
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
     InterpolateRow = InterpolateRow_Any_NEON;
     if (IS_ALIGNED(dst_width, 4)) {
       InterpolateRow = InterpolateRow_NEON;
     }
   }
 #endif
 #if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
       IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
     InterpolateRow = InterpolateRow_MIPS_DSPR2;
   }
 #endif
   void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
       int dst_width, int x, int dx) = ScaleARGBFilterCols_C;
 #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
   }
 #endif
   int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
   if (y > maxy) {
     y = maxy;
   }
   int yi = y >> 16;
   const uint8* src = src_argb + yi * src_stride;
   SIMD_ALIGNED(uint8 row[2 * kMaxStride]);
   uint8* rowptr = row;
   int rowstride = kMaxStride;
   int lasty = yi;

   ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
   if (src_height > 1) {
     src += src_stride;
   }
   ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
   src += src_stride;

   for (int j = 0; j < dst_height; ++j) {
     yi = y >> 16;
     if (yi != lasty) {
       if (y <= maxy) {
         ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
         rowptr += rowstride;
         rowstride = -rowstride;
         lasty = yi;
         src += src_stride;
       }
     }
     int yf = (y >> 8) & 255;
     InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
     dst_argb += dst_stride;
     y += dy;
   }
 }

 // Scales a single row of pixels using point sampling.
 // Code is adapted from libyuv bilinear yuv scaling, but with bilinear
 // interpolation off, and argb pixels instead of yuv.
 static void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
                             int dst_width, int x, int dx) {
   const uint32* src = reinterpret_cast<const uint32*>(src_argb);
   uint32* dst = reinterpret_cast<uint32*>(dst_argb);
   for (int j = 0; j < dst_width - 1; j += 2) {
     dst[0] = src[x >> 16];
     x += dx;
     dst[1] = src[x >> 16];
     x += dx;
     dst += 2;
   }
   if (dst_width & 1) {
     dst[0] = src[x >> 16];
   }
 }

 // ScaleARGB ARGB to/from any dimensions, without interpolation.
 // Fixed point math is used for performance: The upper 16 bits
 // of x and dx is the integer part of the source position and
 // the lower 16 bits are the fixed decimal part.

 static void ScaleARGBSimple(int src_width, int src_height,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
                             const uint8* src_argb, uint8* dst_argb,
                             int x, int dx, int y, int dy) {
   void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
       int dst_width, int x, int dx) = ScaleARGBCols_C;
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     ScaleARGBCols = ScaleARGBCols_SSE2;
   }
 #endif

   for (int i = 0; i < dst_height; ++i) {
     ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
                   dst_width, x, dx);
     dst_argb += dst_stride;
     y += dy;
   }
 }

 // ScaleARGB ARGB to/from any dimensions.
 static void ScaleARGBAnySize(int src_width, int src_height,
                              int dst_width, int dst_height,
                              int clip_width, int clip_height,
                              int src_stride, int dst_stride,
                              const uint8* src_argb, uint8* dst_argb,
                              int x, int dx, int y, int dy,
                              FilterMode filtering) {
   if (filtering && dy < 65536 && dst_width * 4 <= kMaxStride) {
     ScaleARGBBilinearUp(src_width, src_height,
                         clip_width, clip_height,
                         src_stride, dst_stride, src_argb, dst_argb,
                         x, dx, y, dy);
     return;
   }
   if (filtering && src_width * 4 < kMaxStride) {
     ScaleARGBBilinearDown(src_height,
                           clip_width, clip_height,
                           src_stride, dst_stride, src_argb, dst_argb,
                           x, dx, y, dy);
     return;
   }
   ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
                   src_stride, dst_stride, src_argb, dst_argb,
                   x, dx, y, dy);
 }

 // ScaleARGB a ARGB.
 // This function in turn calls a scaling function
 // suitable for handling the desired resolutions.
 static void ScaleARGB(const uint8* src, int src_stride,
                       int src_width, int src_height,
                       uint8* dst, int dst_stride,
                       int dst_width, int dst_height,
                       int clip_x, int clip_y, int clip_width, int clip_height,
                       FilterMode filtering) {
   // Negative src_height means invert the image.
   if (src_height < 0) {
     src_height = -src_height;
     src = src + (src_height - 1) * src_stride;
     src_stride = -src_stride;
   }
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int dx = 0;
   int dy = 0;
   int x = 0;
   int y = 0;
   if (filtering) {
     // Scale step for bilinear sampling renders last pixel once for upsample.
     if (dst_width <= Abs(src_width)) {
       dx = (Abs(src_width) << 16) / dst_width;
       x = (dx >> 1) - 32768;
     } else if (dst_width > 1) {
       dx = ((Abs(src_width) - 1) << 16) / (dst_width - 1);
     }
     if (dst_height <= src_height) {
       dy = (src_height << 16) / dst_height;
       y = (dy >> 1) - 32768;
     } else if (dst_height > 1) {
       dy = ((src_height - 1) << 16) / (dst_height - 1);
     }
   } else {
     // Scale step for point sampling duplicates all pixels equally.
     dx = (Abs(src_width) << 16) / dst_width;
     dy = (src_height << 16) / dst_height;
     x = dx >> 1;
     y = dy >> 1;
   }
   // Negative src_width means horizontally mirror.
   if (src_width < 0) {
     x += (dst_width - 1) * dx;
     dx = -dx;
     src_width = -src_width;
   }
   if (clip_x) {
     x += clip_x * dx;
     dst += clip_x * 4;
   }
   if (clip_y) {
     y += clip_y * dy;
     dst += clip_y * dst_stride;
   }

   // Special case for integer step values.
   if (((dx | dy) & 0xffff) == 0) {
     if (!dx || !dy) {
       filtering = kFilterNone;
     } else {
       // Optimized even scale down. ie 2, 4, 6, 8, 10x.
       if (!(dx & 0x10000) && !(dy & 0x10000)) {
         if ((dx >> 16) == 2) {
           // Optimized 1/2 horizontal.
           ScaleARGBDown2(src_width, src_height, clip_width, clip_height,
                          src_stride, dst_stride, src, dst,
                          x, dx, y, dy, filtering);
           return;
         }
         ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
                           src_stride, dst_stride, src, dst,
                           x, dx, y, dy, filtering);
         return;
       }
       // Optimized odd scale down. ie 3, 5, 7, 9x.
       if ((dx & 0x10000) && (dy & 0x10000)) {
         filtering = kFilterNone;
         if (dst_width == src_width && dst_height == src_height) {
           // Straight copy.
           ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
                    dst, dst_stride, clip_width, clip_height);
           return;
         }
       }
     }
   }
   // Arbitrary scale up and/or down.
   ScaleARGBAnySize(src_width, src_height,
                    dst_width, dst_height,
                    clip_width, clip_height,
                    src_stride, dst_stride, src, dst, x, dx, y, dy, filtering);
 }

 LIBYUV_API
 int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
                   int src_width, int src_height,
                   uint8* dst_argb, int dst_stride_argb,
                   int dst_width, int dst_height,
                   int clip_x, int clip_y, int clip_width, int clip_height,
                   enum FilterMode filtering) {
   if (!src_argb || src_width == 0 || src_height == 0 ||
       !dst_argb || dst_width <= 0 || dst_height <= 0 ||
       clip_x < 0 || clip_y < 0 ||
       src_width > 32767 || src_height > 32767 ||
       (clip_x + clip_width) > dst_width ||
       (clip_y + clip_height) > dst_height) {
     return -1;
   }
   ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
             dst_argb, dst_stride_argb, dst_width, dst_height,
             clip_x, clip_y, clip_width, clip_height, filtering);
   return 0;
 }

 // Scale an ARGB image.
 LIBYUV_API
 int ARGBScale(const uint8* src_argb, int src_stride_argb,
               int src_width, int src_height,
               uint8* dst_argb, int dst_stride_argb,
               int dst_width, int dst_height,
               FilterMode filtering) {
   if (!src_argb || src_width == 0 || src_height == 0 ||
       !dst_argb || dst_width <= 0 || dst_height <= 0 ||
       src_width > 32767 || src_height > 32767) {
     return -1;
   }
   ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
             dst_argb, dst_stride_argb, dst_width, dst_height,
             0, 0, dst_width, dst_height, filtering);
   return 0;
 }

 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif